llvm.org GIT mirror llvm / ce0c81e
Add regular expression matching support, based on OpenBSD regexec()/regcomp() implementation. git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@80493 91177308-0d34-0410-b5e6-96231b3b80d8 Torok Edwin 10 years ago
18 changed file(s) with 4532 addition(s) and 0 deletion(s). Raw diff Collapse all Expand all
6565 llvm/projects/sample/autoconf
6666 CellSPU backend llvm/lib/Target/CellSPU/README.txt
6767 Google Test llvm/utils/unittest/googletest
68 OpenBSD regex llvm/lib/Support/{reg*, COPYRIGHT.regex}
0 .\" $OpenBSD: re_format.7,v 1.14 2007/05/31 19:19:30 jmc Exp $
1 .\"
2 .\" Copyright (c) 1997, Phillip F Knaack. All rights reserved.
3 .\"
4 .\" Copyright (c) 1992, 1993, 1994 Henry Spencer.
5 .\" Copyright (c) 1992, 1993, 1994
6 .\" The Regents of the University of California. All rights reserved.
7 .\"
8 .\" This code is derived from software contributed to Berkeley by
9 .\" Henry Spencer.
10 .\"
11 .\" Redistribution and use in source and binary forms, with or without
12 .\" modification, are permitted provided that the following conditions
13 .\" are met:
14 .\" 1. Redistributions of source code must retain the above copyright
15 .\" notice, this list of conditions and the following disclaimer.
16 .\" 2. Redistributions in binary form must reproduce the above copyright
17 .\" notice, this list of conditions and the following disclaimer in the
18 .\" documentation and/or other materials provided with the distribution.
19 .\" 3. Neither the name of the University nor the names of its contributors
20 .\" may be used to endorse or promote products derived from this software
21 .\" without specific prior written permission.
22 .\"
23 .\" THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
24 .\" ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
25 .\" IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
26 .\" ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
27 .\" FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
28 .\" DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
29 .\" OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
30 .\" HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
31 .\" LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
32 .\" OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
33 .\" SUCH DAMAGE.
34 .\"
35 .\" @(#)re_format.7 8.3 (Berkeley) 3/20/94
36 .\"
37 .Dd $Mdocdate: May 31 2007 $
38 .Dt RE_FORMAT 7
39 .Os
40 .Sh NAME
41 .Nm re_format
42 .Nd POSIX regular expressions
43 .Sh DESCRIPTION
44 Regular expressions (REs),
45 as defined in
46 .St -p1003.1-2004 ,
47 come in two forms:
48 basic regular expressions
49 (BREs)
50 and extended regular expressions
51 (EREs).
52 Both forms of regular expressions are supported
53 by the interfaces described in
54 .Xr regex 3 .
55 Applications dealing with regular expressions
56 may use one or the other form
57 (or indeed both).
58 For example,
59 .Xr ed 1
60 uses BREs,
61 whilst
62 .Xr egrep 1
63 talks EREs.
64 Consult the manual page for the specific application to find out which
65 it uses.
66 .Pp
67 POSIX leaves some aspects of RE syntax and semantics open;
68 .Sq **
69 marks decisions on these aspects that
70 may not be fully portable to other POSIX implementations.
71 .Pp
72 This manual page first describes regular expressions in general,
73 specifically extended regular expressions,
74 and then discusses differences between them and basic regular expressions.
75 .Sh EXTENDED REGULAR EXPRESSIONS
76 An ERE is one** or more non-empty**
77 .Em branches ,
78 separated by
79 .Sq \*(Ba .
80 It matches anything that matches one of the branches.
81 .Pp
82 A branch is one** or more
83 .Em pieces ,
84 concatenated.
85 It matches a match for the first, followed by a match for the second, etc.
86 .Pp
87 A piece is an
88 .Em atom
89 possibly followed by a single**
90 .Sq * ,
91 .Sq + ,
92 .Sq ?\& ,
93 or
94 .Em bound .
95 An atom followed by
96 .Sq *
97 matches a sequence of 0 or more matches of the atom.
98 An atom followed by
99 .Sq +
100 matches a sequence of 1 or more matches of the atom.
101 An atom followed by
102 .Sq ?\&
103 matches a sequence of 0 or 1 matches of the atom.
104 .Pp
105 A bound is
106 .Sq {
107 followed by an unsigned decimal integer,
108 possibly followed by
109 .Sq ,\&
110 possibly followed by another unsigned decimal integer,
111 always followed by
112 .Sq } .
113 The integers must lie between 0 and
114 .Dv RE_DUP_MAX
115 (255**) inclusive,
116 and if there are two of them, the first may not exceed the second.
117 An atom followed by a bound containing one integer
118 .Ar i
119 and no comma matches
120 a sequence of exactly
121 .Ar i
122 matches of the atom.
123 An atom followed by a bound
124 containing one integer
125 .Ar i
126 and a comma matches
127 a sequence of
128 .Ar i
129 or more matches of the atom.
130 An atom followed by a bound
131 containing two integers
132 .Ar i
133 and
134 .Ar j
135 matches a sequence of
136 .Ar i
137 through
138 .Ar j
139 (inclusive) matches of the atom.
140 .Pp
141 An atom is a regular expression enclosed in
142 .Sq ()
143 (matching a part of the regular expression),
144 an empty set of
145 .Sq ()
146 (matching the null string)**,
147 a
148 .Em bracket expression
149 (see below),
150 .Sq .\&
151 (matching any single character),
152 .Sq ^
153 (matching the null string at the beginning of a line),
154 .Sq $
155 (matching the null string at the end of a line),
156 a
157 .Sq \e
158 followed by one of the characters
159 .Sq ^.[$()|*+?{\e
160 (matching that character taken as an ordinary character),
161 a
162 .Sq \e
163 followed by any other character**
164 (matching that character taken as an ordinary character,
165 as if the
166 .Sq \e
167 had not been present**),
168 or a single character with no other significance (matching that character).
169 A
170 .Sq {
171 followed by a character other than a digit is an ordinary character,
172 not the beginning of a bound**.
173 It is illegal to end an RE with
174 .Sq \e .
175 .Pp
176 A bracket expression is a list of characters enclosed in
177 .Sq [] .
178 It normally matches any single character from the list (but see below).
179 If the list begins with
180 .Sq ^ ,
181 it matches any single character
182 .Em not
183 from the rest of the list
184 (but see below).
185 If two characters in the list are separated by
186 .Sq - ,
187 this is shorthand for the full
188 .Em range
189 of characters between those two (inclusive) in the
190 collating sequence, e.g.\&
191 .Sq [0-9]
192 in ASCII matches any decimal digit.
193 It is illegal** for two ranges to share an endpoint, e.g.\&
194 .Sq a-c-e .
195 Ranges are very collating-sequence-dependent,
196 and portable programs should avoid relying on them.
197 .Pp
198 To include a literal
199 .Sq ]\&
200 in the list, make it the first character
201 (following a possible
202 .Sq ^ ) .
203 To include a literal
204 .Sq - ,
205 make it the first or last character,
206 or the second endpoint of a range.
207 To use a literal
208 .Sq -
209 as the first endpoint of a range,
210 enclose it in
211 .Sq [.
212 and
213 .Sq .]
214 to make it a collating element (see below).
215 With the exception of these and some combinations using
216 .Sq [
217 (see next paragraphs),
218 all other special characters, including
219 .Sq \e ,
220 lose their special significance within a bracket expression.
221 .Pp
222 Within a bracket expression, a collating element
223 (a character,
224 a multi-character sequence that collates as if it were a single character,
225 or a collating-sequence name for either)
226 enclosed in
227 .Sq [.
228 and
229 .Sq .]
230 stands for the sequence of characters of that collating element.
231 The sequence is a single element of the bracket expression's list.
232 A bracket expression containing a multi-character collating element
233 can thus match more than one character,
234 e.g. if the collating sequence includes a
235 .Sq ch
236 collating element,
237 then the RE
238 .Sq [[.ch.]]*c
239 matches the first five characters of
240 .Sq chchcc .
241 .Pp
242 Within a bracket expression, a collating element enclosed in
243 .Sq [=
244 and
245 .Sq =]
246 is an equivalence class, standing for the sequences of characters
247 of all collating elements equivalent to that one, including itself.
248 (If there are no other equivalent collating elements,
249 the treatment is as if the enclosing delimiters were
250 .Sq [.
251 and
252 .Sq .] . )
253 For example, if
254 .Sq x
255 and
256 .Sq y
257 are the members of an equivalence class,
258 then
259 .Sq [[=x=]] ,
260 .Sq [[=y=]] ,
261 and
262 .Sq [xy]
263 are all synonymous.
264 An equivalence class may not** be an endpoint of a range.
265 .Pp
266 Within a bracket expression, the name of a
267 .Em character class
268 enclosed
269 in
270 .Sq [:
271 and
272 .Sq :]
273 stands for the list of all characters belonging to that class.
274 Standard character class names are:
275 .Bd -literal -offset indent
276 alnum digit punct
277 alpha graph space
278 blank lower upper
279 cntrl print xdigit
280 .Ed
281 .Pp
282 These stand for the character classes defined in
283 .Xr ctype 3 .
284 A locale may provide others.
285 A character class may not be used as an endpoint of a range.
286 .Pp
287 There are two special cases** of bracket expressions:
288 the bracket expressions
289 .Sq [[:<:]]
290 and
291 .Sq [[:>:]]
292 match the null string at the beginning and end of a word, respectively.
293 A word is defined as a sequence of
294 characters starting and ending with a word character
295 which is neither preceded nor followed by
296 word characters.
297 A word character is an
298 .Em alnum
299 character (as defined by
300 .Xr ctype 3 )
301 or an underscore.
302 This is an extension,
303 compatible with but not specified by POSIX,
304 and should be used with
305 caution in software intended to be portable to other systems.
306 .Pp
307 In the event that an RE could match more than one substring of a given
308 string,
309 the RE matches the one starting earliest in the string.
310 If the RE could match more than one substring starting at that point,
311 it matches the longest.
312 Subexpressions also match the longest possible substrings, subject to
313 the constraint that the whole match be as long as possible,
314 with subexpressions starting earlier in the RE taking priority over
315 ones starting later.
316 Note that higher-level subexpressions thus take priority over
317 their lower-level component subexpressions.
318 .Pp
319 Match lengths are measured in characters, not collating elements.
320 A null string is considered longer than no match at all.
321 For example,
322 .Sq bb*
323 matches the three middle characters of
324 .Sq abbbc ;
325 .Sq (wee|week)(knights|nights)
326 matches all ten characters of
327 .Sq weeknights ;
328 when
329 .Sq (.*).*
330 is matched against
331 .Sq abc ,
332 the parenthesized subexpression matches all three characters;
333 and when
334 .Sq (a*)*
335 is matched against
336 .Sq bc ,
337 both the whole RE and the parenthesized subexpression match the null string.
338 .Pp
339 If case-independent matching is specified,
340 the effect is much as if all case distinctions had vanished from the
341 alphabet.
342 When an alphabetic that exists in multiple cases appears as an
343 ordinary character outside a bracket expression, it is effectively
344 transformed into a bracket expression containing both cases,
345 e.g.\&
346 .Sq x
347 becomes
348 .Sq [xX] .
349 When it appears inside a bracket expression,
350 all case counterparts of it are added to the bracket expression,
351 so that, for example,
352 .Sq [x]
353 becomes
354 .Sq [xX]
355 and
356 .Sq [^x]
357 becomes
358 .Sq [^xX] .
359 .Pp
360 No particular limit is imposed on the length of REs**.
361 Programs intended to be portable should not employ REs longer
362 than 256 bytes,
363 as an implementation can refuse to accept such REs and remain
364 POSIX-compliant.
365 .Pp
366 The following is a list of extended regular expressions:
367 .Bl -tag -width Ds
368 .It Ar c
369 Any character
370 .Ar c
371 not listed below matches itself.
372 .It \e Ns Ar c
373 Any backslash-escaped character
374 .Ar c
375 matches itself.
376 .It \&.
377 Matches any single character that is not a newline
378 .Pq Sq \en .
379 .It Bq Ar char-class
380 Matches any single character in
381 .Ar char-class .
382 To include a
383 .Ql \&]
384 in
385 .Ar char-class ,
386 it must be the first character.
387 A range of characters may be specified by separating the end characters
388 of the range with a
389 .Ql - ;
390 e.g.\&
391 .Ar a-z
392 specifies the lower case characters.
393 The following literal expressions can also be used in
394 .Ar char-class
395 to specify sets of characters:
396 .Bd -unfilled -offset indent
397 [:alnum:] [:cntrl:] [:lower:] [:space:]
398 [:alpha:] [:digit:] [:print:] [:upper:]
399 [:blank:] [:graph:] [:punct:] [:xdigit:]
400 .Ed
401 .Pp
402 If
403 .Ql -
404 appears as the first or last character of
405 .Ar char-class ,
406 then it matches itself.
407 All other characters in
408 .Ar char-class
409 match themselves.
410 .Pp
411 Patterns in
412 .Ar char-class
413 of the form
414 .Eo [.
415 .Ar col-elm
416 .Ec .]\&
417 or
418 .Eo [=
419 .Ar col-elm
420 .Ec =]\& ,
421 where
422 .Ar col-elm
423 is a collating element, are interpreted according to
424 .Xr setlocale 3
425 .Pq not currently supported .
426 .It Bq ^ Ns Ar char-class
427 Matches any single character, other than newline, not in
428 .Ar char-class .
429 .Ar char-class
430 is defined as above.
431 .It ^
432 If
433 .Sq ^
434 is the first character of a regular expression, then it
435 anchors the regular expression to the beginning of a line.
436 Otherwise, it matches itself.
437 .It $
438 If
439 .Sq $
440 is the last character of a regular expression,
441 it anchors the regular expression to the end of a line.
442 Otherwise, it matches itself.
443 .It [[:<:]]
444 Anchors the single character regular expression or subexpression
445 immediately following it to the beginning of a word.
446 .It [[:>:]]
447 Anchors the single character regular expression or subexpression
448 immediately following it to the end of a word.
449 .It Pq Ar re
450 Defines a subexpression
451 .Ar re .
452 Any set of characters enclosed in parentheses
453 matches whatever the set of characters without parentheses matches
454 (that is a long-winded way of saying the constructs
455 .Sq (re)
456 and
457 .Sq re
458 match identically).
459 .It *
460 Matches the single character regular expression or subexpression
461 immediately preceding it zero or more times.
462 If
463 .Sq *
464 is the first character of a regular expression or subexpression,
465 then it matches itself.
466 The
467 .Sq *
468 operator sometimes yields unexpected results.
469 For example, the regular expression
470 .Ar b*
471 matches the beginning of the string
472 .Qq abbb
473 (as opposed to the substring
474 .Qq bbb ) ,
475 since a null match is the only leftmost match.
476 .It +
477 Matches the single character regular expression
478 or subexpression immediately preceding it
479 one or more times.
480 .It ?
481 Matches the single character regular expression
482 or subexpression immediately preceding it
483 0 or 1 times.
484 .Sm off
485 .It Xo
486 .Pf { Ar n , m No }\ \&
487 .Pf { Ar n , No }\ \&
488 .Pf { Ar n No }
489 .Xc
490 .Sm on
491 Matches the single character regular expression or subexpression
492 immediately preceding it at least
493 .Ar n
494 and at most
495 .Ar m
496 times.
497 If
498 .Ar m
499 is omitted, then it matches at least
500 .Ar n
501 times.
502 If the comma is also omitted, then it matches exactly
503 .Ar n
504 times.
505 .It \*(Ba
506 Used to separate patterns.
507 For example,
508 the pattern
509 .Sq cat\*(Badog
510 matches either
511 .Sq cat
512 or
513 .Sq dog .
514 .El
515 .Sh BASIC REGULAR EXPRESSIONS
516 Basic regular expressions differ in several respects:
517 .Bl -bullet -offset 3n
518 .It
519 .Sq \*(Ba ,
520 .Sq + ,
521 and
522 .Sq ?\&
523 are ordinary characters and there is no equivalent
524 for their functionality.
525 .It
526 The delimiters for bounds are
527 .Sq \e{
528 and
529 .Sq \e} ,
530 with
531 .Sq {
532 and
533 .Sq }
534 by themselves ordinary characters.
535 .It
536 The parentheses for nested subexpressions are
537 .Sq \e(
538 and
539 .Sq \e) ,
540 with
541 .Sq (
542 and
543 .Sq )\&
544 by themselves ordinary characters.
545 .It
546 .Sq ^
547 is an ordinary character except at the beginning of the
548 RE or** the beginning of a parenthesized subexpression.
549 .It
550 .Sq $
551 is an ordinary character except at the end of the
552 RE or** the end of a parenthesized subexpression.
553 .It
554 .Sq *
555 is an ordinary character if it appears at the beginning of the
556 RE or the beginning of a parenthesized subexpression
557 (after a possible leading
558 .Sq ^ ) .
559 .It
560 Finally, there is one new type of atom, a
561 .Em back-reference :
562 .Sq \e
563 followed by a non-zero decimal digit
564 .Ar d
565 matches the same sequence of characters matched by the
566 .Ar d Ns th
567 parenthesized subexpression
568 (numbering subexpressions by the positions of their opening parentheses,
569 left to right),
570 so that, for example,
571 .Sq \e([bc]\e)\e1
572 matches
573 .Sq bb\&
574 or
575 .Sq cc
576 but not
577 .Sq bc .
578 .El
579 .Pp
580 The following is a list of basic regular expressions:
581 .Bl -tag -width Ds
582 .It Ar c
583 Any character
584 .Ar c
585 not listed below matches itself.
586 .It \e Ns Ar c
587 Any backslash-escaped character
588 .Ar c ,
589 except for
590 .Sq { ,
591 .Sq } ,
592 .Sq \&( ,
593 and
594 .Sq \&) ,
595 matches itself.
596 .It \&.
597 Matches any single character that is not a newline
598 .Pq Sq \en .
599 .It Bq Ar char-class
600 Matches any single character in
601 .Ar char-class .
602 To include a
603 .Ql \&]
604 in
605 .Ar char-class ,
606 it must be the first character.
607 A range of characters may be specified by separating the end characters
608 of the range with a
609 .Ql - ;
610 e.g.\&
611 .Ar a-z
612 specifies the lower case characters.
613 The following literal expressions can also be used in
614 .Ar char-class
615 to specify sets of characters:
616 .Bd -unfilled -offset indent
617 [:alnum:] [:cntrl:] [:lower:] [:space:]
618 [:alpha:] [:digit:] [:print:] [:upper:]
619 [:blank:] [:graph:] [:punct:] [:xdigit:]
620 .Ed
621 .Pp
622 If
623 .Ql -
624 appears as the first or last character of
625 .Ar char-class ,
626 then it matches itself.
627 All other characters in
628 .Ar char-class
629 match themselves.
630 .Pp
631 Patterns in
632 .Ar char-class
633 of the form
634 .Eo [.
635 .Ar col-elm
636 .Ec .]\&
637 or
638 .Eo [=
639 .Ar col-elm
640 .Ec =]\& ,
641 where
642 .Ar col-elm
643 is a collating element, are interpreted according to
644 .Xr setlocale 3
645 .Pq not currently supported .
646 .It Bq ^ Ns Ar char-class
647 Matches any single character, other than newline, not in
648 .Ar char-class .
649 .Ar char-class
650 is defined as above.
651 .It ^
652 If
653 .Sq ^
654 is the first character of a regular expression, then it
655 anchors the regular expression to the beginning of a line.
656 Otherwise, it matches itself.
657 .It $
658 If
659 .Sq $
660 is the last character of a regular expression,
661 it anchors the regular expression to the end of a line.
662 Otherwise, it matches itself.
663 .It [[:<:]]
664 Anchors the single character regular expression or subexpression
665 immediately following it to the beginning of a word.
666 .It [[:>:]]
667 Anchors the single character regular expression or subexpression
668 immediately following it to the end of a word.
669 .It \e( Ns Ar re Ns \e)
670 Defines a subexpression
671 .Ar re .
672 Subexpressions may be nested.
673 A subsequent backreference of the form
674 .Pf \e Ns Ar n ,
675 where
676 .Ar n
677 is a number in the range [1,9], expands to the text matched by the
678 .Ar n Ns th
679 subexpression.
680 For example, the regular expression
681 .Ar \e(.*\e)\e1
682 matches any string consisting of identical adjacent substrings.
683 Subexpressions are ordered relative to their left delimiter.
684 .It *
685 Matches the single character regular expression or subexpression
686 immediately preceding it zero or more times.
687 If
688 .Sq *
689 is the first character of a regular expression or subexpression,
690 then it matches itself.
691 The
692 .Sq *
693 operator sometimes yields unexpected results.
694 For example, the regular expression
695 .Ar b*
696 matches the beginning of the string
697 .Qq abbb
698 (as opposed to the substring
699 .Qq bbb ) ,
700 since a null match is the only leftmost match.
701 .Sm off
702 .It Xo
703 .Pf \e{ Ar n , m No \e}\ \&
704 .Pf \e{ Ar n , No \e}\ \&
705 .Pf \e{ Ar n No \e}
706 .Xc
707 .Sm on
708 Matches the single character regular expression or subexpression
709 immediately preceding it at least
710 .Ar n
711 and at most
712 .Ar m
713 times.
714 If
715 .Ar m
716 is omitted, then it matches at least
717 .Ar n
718 times.
719 If the comma is also omitted, then it matches exactly
720 .Ar n
721 times.
722 .El
723 .Sh SEE ALSO
724 .Xr ctype 3 ,
725 .Xr regex 3
726 .Sh STANDARDS
727 .St -p1003.1-2004 :
728 Base Definitions, Chapter 9 (Regular Expressions).
729 .Sh BUGS
730 Having two kinds of REs is a botch.
731 .Pp
732 The current POSIX spec says that
733 .Sq )\&
734 is an ordinary character in the absence of an unmatched
735 .Sq ( ;
736 this was an unintentional result of a wording error,
737 and change is likely.
738 Avoid relying on it.
739 .Pp
740 Back-references are a dreadful botch,
741 posing major problems for efficient implementations.
742 They are also somewhat vaguely defined
743 (does
744 .Sq a\e(\e(b\e)*\e2\e)*d
745 match
746 .Sq abbbd ? ) .
747 Avoid using them.
748 .Pp
749 POSIX's specification of case-independent matching is vague.
750 The
751 .Dq one case implies all cases
752 definition given above
753 is the current consensus among implementors as to the right interpretation.
754 .Pp
755 The syntax for word boundaries is incredibly ugly.
0 //===-- Regex.h - Regular Expression matcher implementation -*- C++ -*-----===//
1 //
2 // The LLVM Compiler Infrastructure
3 //
4 // This file is distributed under the University of Illinois Open Source
5 // License. See LICENSE.TXT for details.
6 //
7 //===----------------------------------------------------------------------===//
8 //
9 // This file implements a POSIX regular expression matcher.
10 //
11 //===----------------------------------------------------------------------===//
12
13 #include "llvm/ADT/SmallVector.h"
14 #include "llvm/ADT/StringRef.h"
15
16 struct llvm_regex;
17 namespace llvm {
18 class Regex {
19 public:
20 enum {
21 /// Compile with support for subgroup matches, this is just to make
22 /// constructs like Regex("...", 0) more readable as Regex("...", Sub).
23 Sub=0,
24 /// Compile for matching that ignores upper/lower case distinctions.
25 IgnoreCase=1,
26 /// Compile for matching that need only report success or failure,
27 /// not what was matched.
28 NoSub=2,
29 /// Compile for newline-sensitive matching. With this flag '[^' bracket
30 /// expressions and '.' never match newline. A ^ anchor matches the
31 /// null string after any newline in the string in addition to its normal
32 /// function, and the $ anchor matches the null string before any
33 /// newline in the string in addition to its normal function.
34 Newline=4
35 };
36
37 /// Compiles the given POSIX Extended Regular Expression \arg Regex.
38 /// This implementation supports regexes and matching strings with embedded
39 /// NUL characters.
40 Regex(const StringRef &Regex, unsigned Flags=NoSub);
41 ~Regex();
42
43 /// isValid - returns the error encountered during regex compilation, or
44 /// matching, if any.
45 bool isValid(std::string &Error);
46
47 /// matches - Match the regex against a given \arg String.
48 ///
49 /// \param Matches - If given, on a succesful match this will be filled in
50 /// with references to the matched group expressions (inside \arg String),
51 /// the first group is always the entire pattern.
52 /// By default the regex is compiled with NoSub, which disables support for
53 /// Matches.
54 /// For this feature to be enabled you must construct the regex using
55 /// Regex("...", Regex::Sub) constructor.
56
57 bool match(const StringRef &String, SmallVectorImpl *Matches=0);
58 private:
59 struct llvm_regex *preg;
60 int error;
61 bool sub;
62 };
63 }
3131 Twine.cpp
3232 raw_os_ostream.cpp
3333 raw_ostream.cpp
34 Regex.cpp
35 regcomp.c
36 regerror.c
37 regexec.c
38 regfree.c
39 regstrlcpy.c
3440 )
3541
3642 target_link_libraries (LLVMSupport LLVMSystem)
0 $OpenBSD: COPYRIGHT,v 1.3 2003/06/02 20:18:36 millert Exp $
1
2 Copyright 1992, 1993, 1994 Henry Spencer. All rights reserved.
3 This software is not subject to any license of the American Telephone
4 and Telegraph Company or of the Regents of the University of California.
5
6 Permission is granted to anyone to use this software for any purpose on
7 any computer system, and to alter it and redistribute it, subject
8 to the following restrictions:
9
10 1. The author is not responsible for the consequences of use of this
11 software, no matter how awful, even if they arise from flaws in it.
12
13 2. The origin of this software must not be misrepresented, either by
14 explicit claim or by omission. Since few users ever read sources,
15 credits must appear in the documentation.
16
17 3. Altered versions must be plainly marked as such, and must not be
18 misrepresented as being the original software. Since few users
19 ever read sources, credits must appear in the documentation.
20
21 4. This notice may not be removed or altered.
22
23 =-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=
24 /*-
25 * Copyright (c) 1994
26 * The Regents of the University of California. All rights reserved.
27 *
28 * Redistribution and use in source and binary forms, with or without
29 * modification, are permitted provided that the following conditions
30 * are met:
31 * 1. Redistributions of source code must retain the above copyright
32 * notice, this list of conditions and the following disclaimer.
33 * 2. Redistributions in binary form must reproduce the above copyright
34 * notice, this list of conditions and the following disclaimer in the
35 * documentation and/or other materials provided with the distribution.
36 * 3. Neither the name of the University nor the names of its contributors
37 * may be used to endorse or promote products derived from this software
38 * without specific prior written permission.
39 *
40 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
41 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
42 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
43 * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
44 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
45 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
46 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
47 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
48 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
49 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
50 * SUCH DAMAGE.
51 *
52 * @(#)COPYRIGHT 8.1 (Berkeley) 3/16/94
53 */
0 //===-- Regex.cpp - Regular Expression matcher implementation -------------===//
1 //
2 // The LLVM Compiler Infrastructure
3 //
4 // This file is distributed under the University of Illinois Open Source
5 // License. See LICENSE.TXT for details.
6 //
7 //===----------------------------------------------------------------------===//
8 //
9 // This file implements a POSIX regular expression matcher.
10 //
11 //===----------------------------------------------------------------------===//
12 #include "llvm/Support/Regex.h"
13 #include "llvm/Support/ErrorHandling.h"
14 #include "llvm/Support/raw_ostream.h"
15 #include "regex_impl.h"
16 #include <string>
17
18 using namespace llvm;
19 Regex::Regex(const StringRef ®ex, unsigned Flags)
20 {
21 unsigned flags = 0;
22 preg = new struct llvm_regex;
23 preg->re_endp = regex.end();
24 if (Flags & IgnoreCase)
25 flags |= REG_ICASE;
26 if (Flags & NoSub) {
27 flags |= REG_NOSUB;
28 sub = false;
29 } else {
30 sub = true;
31 }
32 if (Flags & Newline)
33 flags |= REG_NEWLINE;
34 error = llvm_regcomp(preg, regex.data(), flags|REG_EXTENDED|REG_PEND);
35 }
36
37 bool Regex::isValid(std::string &Error)
38 {
39 if (!error)
40 return true;
41
42 size_t len = llvm_regerror(error, preg, NULL, 0);
43 char *errbuff = new char[len];
44 llvm_regerror(error, preg, errbuff, len);
45 Error.assign(errbuff);
46 return false;
47 }
48
49 Regex::~Regex()
50 {
51 llvm_regfree(preg);
52 delete preg;
53 }
54
55 bool Regex::match(const StringRef &String, SmallVectorImpl *Matches)
56 {
57 unsigned nmatch = Matches ? preg->re_nsub+1 : 0;
58
59 if (Matches) {
60 assert(sub && "Substring matching requested but pattern compiled without");
61 Matches->clear();
62 }
63
64 // pmatch needs to have at least one element.
65 SmallVector pm;
66 pm.resize(nmatch > 0 ? nmatch : 1);
67 pm[0].rm_so = 0;
68 pm[0].rm_eo = String.size();
69
70 int rc = llvm_regexec(preg, String.data(), nmatch, pm.data(), REG_STARTEND);
71
72 if (rc == REG_NOMATCH)
73 return false;
74 if (rc != 0) {
75 // regexec can fail due to invalid pattern or running out of memory.
76 error = rc;
77 return false;
78 }
79
80 // There was a match.
81
82 if (Matches) { // match position requested
83 for (unsigned i=0;i
84 if (pm[i].rm_so == -1) {
85 // this group didn't match
86 Matches->push_back(StringRef());
87 continue;
88 }
89 assert(pm[i].rm_eo > pm[i].rm_so);
90 Matches->push_back(StringRef(String.data()+pm[i].rm_so,
91 pm[i].rm_eo-pm[i].rm_so));
92 }
93 }
94
95 return true;
96 }
0 /*-
1 * This code is derived from OpenBSD's libc/regex, original license follows:
2 *
3 * This code is derived from OpenBSD's libc/regex, original license follows:
4 *
5 * Copyright (c) 1992, 1993, 1994 Henry Spencer.
6 * Copyright (c) 1992, 1993, 1994
7 * The Regents of the University of California. All rights reserved.
8 *
9 * This code is derived from software contributed to Berkeley by
10 * Henry Spencer.
11 *
12 * Redistribution and use in source and binary forms, with or without
13 * modification, are permitted provided that the following conditions
14 * are met:
15 * 1. Redistributions of source code must retain the above copyright
16 * notice, this list of conditions and the following disclaimer.
17 * 2. Redistributions in binary form must reproduce the above copyright
18 * notice, this list of conditions and the following disclaimer in the
19 * documentation and/or other materials provided with the distribution.
20 * 3. Neither the name of the University nor the names of its contributors
21 * may be used to endorse or promote products derived from this software
22 * without specific prior written permission.
23 *
24 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
25 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
26 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
27 * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
28 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
29 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
30 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
31 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
32 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
33 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
34 * SUCH DAMAGE.
35 *
36 * @(#)cclass.h 8.3 (Berkeley) 3/20/94
37 */
38
/*
 * Character-class table.
 *
 * Each entry pairs a bracket-expression class name (the text between
 * "[:" and ":]") with the single-byte characters belonging to it.  The
 * `multis' string is walked by the class parser as a list of
 * NUL-separated multi-character members; every class here leaves it empty.
 * The table is terminated by an entry whose name is NULL.
 */
struct cclass {
	const char *name;	/* class name */
	const char *chars;	/* member characters, NUL-terminated */
	const char *multis;	/* multi-character members ("" = none) */
};

static struct cclass cclasses[] = {
	{ "alnum",
	  "ABCDEFGHIJKLMNOPQRSTUVWXYZ" "abcdefghijklmnopqrstuvwxyz"
	  "0123456789",
	  "" },
	{ "alpha",
	  "ABCDEFGHIJKLMNOPQRSTUVWXYZ" "abcdefghijklmnopqrstuvwxyz",
	  "" },
	{ "blank",
	  " \t",
	  "" },
	{ "cntrl",
	  "\007\b\t\n\v\f\r" "\1\2\3\4\5\6" "\16\17"
	  "\20\21\22\23\24" "\25\26\27\30\31\32\33\34\35\36\37" "\177",
	  "" },
	{ "digit",
	  "0123456789",
	  "" },
	{ "graph",
	  "ABCDEFGHIJKLMNOPQRSTUVWXYZ" "abcdefghijklmnopqrstuvwxyz"
	  "0123456789" "!\"#$%&'()*+,-./:;<=>?@[\\]^_`{|}~",
	  "" },
	{ "lower",
	  "abcdefghijklmnopqrstuvwxyz",
	  "" },
	{ "print",
	  "ABCDEFGHIJKLMNOPQRSTUVWXYZ" "abcdefghijklmnopqrstuvwxyz"
	  "0123456789" "!\"#$%&'()*+,-./:;<=>?@[\\]^_`{|}~ ",
	  "" },
	{ "punct",
	  "!\"#$%&'()*+,-./:;<=>?@[\\]^_`{|}~",
	  "" },
	{ "space",
	  "\t\n\v\f\r ",
	  "" },
	{ "upper",
	  "ABCDEFGHIJKLMNOPQRSTUVWXYZ",
	  "" },
	{ "xdigit",
	  "0123456789ABCDEFabcdef",
	  "" },
	{ NULL, NULL, "" }	/* end-of-table marker */
};
0 /*-
1 * This code is derived from OpenBSD's libc/regex, original license follows:
2 *
3 * Copyright (c) 1992, 1993, 1994 Henry Spencer.
4 * Copyright (c) 1992, 1993, 1994
5 * The Regents of the University of California. All rights reserved.
6 *
7 * This code is derived from software contributed to Berkeley by
8 * Henry Spencer.
9 *
10 * Redistribution and use in source and binary forms, with or without
11 * modification, are permitted provided that the following conditions
12 * are met:
13 * 1. Redistributions of source code must retain the above copyright
14 * notice, this list of conditions and the following disclaimer.
15 * 2. Redistributions in binary form must reproduce the above copyright
16 * notice, this list of conditions and the following disclaimer in the
17 * documentation and/or other materials provided with the distribution.
18 * 3. Neither the name of the University nor the names of its contributors
19 * may be used to endorse or promote products derived from this software
20 * without specific prior written permission.
21 *
22 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
23 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
24 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
25 * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
26 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
27 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
28 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
29 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
30 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
31 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
32 * SUCH DAMAGE.
33 *
34 * @(#)cname.h 8.3 (Berkeley) 3/20/94
35 */
36
/*
 * Character-name table.
 *
 * Maps the symbolic names accepted inside "[." "." "]" and "[=" "=" "]"
 * to the character they stand for.  Several characters have more than one
 * name; aliases sit next to each other.  Lookup is a linear scan that stops
 * at the entry whose name is NULL.
 */
struct cname {
	const char *name;	/* symbolic name */
	char code;		/* character the name denotes */
};

static struct cname cnames[] = {
	/* control characters */
	{ "NUL", '\0' },
	{ "SOH", '\001' },
	{ "STX", '\002' },
	{ "ETX", '\003' },
	{ "EOT", '\004' },
	{ "ENQ", '\005' },
	{ "ACK", '\006' },
	{ "BEL", '\007' },	{ "alert", '\007' },
	{ "BS", '\010' },	{ "backspace", '\b' },
	{ "HT", '\011' },	{ "tab", '\t' },
	{ "LF", '\012' },	{ "newline", '\n' },
	{ "VT", '\013' },	{ "vertical-tab", '\v' },
	{ "FF", '\014' },	{ "form-feed", '\f' },
	{ "CR", '\015' },	{ "carriage-return", '\r' },
	{ "SO", '\016' },
	{ "SI", '\017' },
	{ "DLE", '\020' },
	{ "DC1", '\021' },
	{ "DC2", '\022' },
	{ "DC3", '\023' },
	{ "DC4", '\024' },
	{ "NAK", '\025' },
	{ "SYN", '\026' },
	{ "ETB", '\027' },
	{ "CAN", '\030' },
	{ "EM", '\031' },
	{ "SUB", '\032' },
	{ "ESC", '\033' },
	{ "IS4", '\034' },	{ "FS", '\034' },
	{ "IS3", '\035' },	{ "GS", '\035' },
	{ "IS2", '\036' },	{ "RS", '\036' },
	{ "IS1", '\037' },	{ "US", '\037' },

	/* space and punctuation up to '/' */
	{ "space", ' ' },
	{ "exclamation-mark", '!' },
	{ "quotation-mark", '"' },
	{ "number-sign", '#' },
	{ "dollar-sign", '$' },
	{ "percent-sign", '%' },
	{ "ampersand", '&' },
	{ "apostrophe", '\'' },
	{ "left-parenthesis", '(' },
	{ "right-parenthesis", ')' },
	{ "asterisk", '*' },
	{ "plus-sign", '+' },
	{ "comma", ',' },
	{ "hyphen", '-' },	{ "hyphen-minus", '-' },
	{ "period", '.' },	{ "full-stop", '.' },
	{ "slash", '/' },	{ "solidus", '/' },

	/* digits */
	{ "zero", '0' },
	{ "one", '1' },
	{ "two", '2' },
	{ "three", '3' },
	{ "four", '4' },
	{ "five", '5' },
	{ "six", '6' },
	{ "seven", '7' },
	{ "eight", '8' },
	{ "nine", '9' },

	/* remaining punctuation */
	{ "colon", ':' },
	{ "semicolon", ';' },
	{ "less-than-sign", '<' },
	{ "equals-sign", '=' },
	{ "greater-than-sign", '>' },
	{ "question-mark", '?' },
	{ "commercial-at", '@' },
	{ "left-square-bracket", '[' },
	{ "backslash", '\\' },	{ "reverse-solidus", '\\' },
	{ "right-square-bracket", ']' },
	{ "circumflex", '^' },	{ "circumflex-accent", '^' },
	{ "underscore", '_' },	{ "low-line", '_' },
	{ "grave-accent", '`' },
	{ "left-brace", '{' },	{ "left-curly-bracket", '{' },
	{ "vertical-line", '|' },
	{ "right-brace", '}' },	{ "right-curly-bracket", '}' },
	{ "tilde", '~' },
	{ "DEL", '\177' },

	{ NULL, 0 }		/* end-of-table marker */
};
0 /*-
1 * This code is derived from OpenBSD's libc/regex, original license follows:
2 *
3 * Copyright (c) 1992, 1993, 1994 Henry Spencer.
4 * Copyright (c) 1992, 1993, 1994
5 * The Regents of the University of California. All rights reserved.
6 *
7 * This code is derived from software contributed to Berkeley by
8 * Henry Spencer.
9 *
10 * Redistribution and use in source and binary forms, with or without
11 * modification, are permitted provided that the following conditions
12 * are met:
13 * 1. Redistributions of source code must retain the above copyright
14 * notice, this list of conditions and the following disclaimer.
15 * 2. Redistributions in binary form must reproduce the above copyright
16 * notice, this list of conditions and the following disclaimer in the
17 * documentation and/or other materials provided with the distribution.
18 * 3. Neither the name of the University nor the names of its contributors
19 * may be used to endorse or promote products derived from this software
20 * without specific prior written permission.
21 *
22 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
23 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
24 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
25 * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
26 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
27 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
28 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
29 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
30 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
31 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
32 * SUCH DAMAGE.
33 *
34 * @(#)regcomp.c 8.5 (Berkeley) 3/20/94
35 */
36
#include <sys/types.h>
#include <stdio.h>
#include <string.h>
#include <ctype.h>
#include <limits.h>
#include <stdlib.h>
43 #include "regex_impl.h"
44
45 #include "regutils.h"
46 #include "regex2.h"
47
48 #include "regcclass.h"
49 #include "regcname.h"
50
51 /*
52 * parse structure, passed up and down to avoid global variables and
53 * other clumsinesses
54 */
55 struct parse {
56 char *next; /* next character in RE */
57 char *end; /* end of string (-> NUL normally) */
58 int error; /* has an error been seen? */
59 sop *strip; /* malloced strip */
60 sopno ssize; /* malloced strip size (allocated) */
61 sopno slen; /* malloced strip length (used) */
62 int ncsalloc; /* number of csets allocated */
63 struct re_guts *g;
64 # define NPAREN 10 /* we need to remember () 1-9 for back refs */
65 sopno pbegin[NPAREN]; /* -> ( ([0] unused) */
66 sopno pend[NPAREN]; /* -> ) ([0] unused) */
67 };
68
69 static void p_ere(struct parse *, int);
70 static void p_ere_exp(struct parse *);
71 static void p_str(struct parse *);
72 static void p_bre(struct parse *, int, int);
73 static int p_simp_re(struct parse *, int);
74 static int p_count(struct parse *);
75 static void p_bracket(struct parse *);
76 static void p_b_term(struct parse *, cset *);
77 static void p_b_cclass(struct parse *, cset *);
78 static void p_b_eclass(struct parse *, cset *);
79 static char p_b_symbol(struct parse *);
80 static char p_b_coll_elem(struct parse *, int);
81 static char othercase(int);
82 static void bothcases(struct parse *, int);
83 static void ordinary(struct parse *, int);
84 static void nonnewline(struct parse *);
85 static void repeat(struct parse *, sopno, int, int);
86 static int seterr(struct parse *, int);
87 static cset *allocset(struct parse *);
88 static void freeset(struct parse *, cset *);
89 static int freezeset(struct parse *, cset *);
90 static int firstch(struct parse *, cset *);
91 static int nch(struct parse *, cset *);
92 static void mcadd(struct parse *, cset *, const char *);
93 static void mcinvert(struct parse *, cset *);
94 static void mccase(struct parse *, cset *);
95 static int isinsets(struct re_guts *, int);
96 static int samesets(struct re_guts *, int, int);
97 static void categorize(struct parse *, struct re_guts *);
98 static sopno dupl(struct parse *, sopno, sopno);
99 static void doemit(struct parse *, sop, size_t);
100 static void doinsert(struct parse *, sop, size_t, sopno);
101 static void dofwd(struct parse *, sopno, sop);
102 static void enlarge(struct parse *, sopno);
103 static void stripsnug(struct parse *, struct re_guts *);
104 static void findmust(struct parse *, struct re_guts *);
105 static sopno pluscount(struct parse *, struct re_guts *);
106
/* buffer the scanner is redirected to once an error has been recorded */
static char nuls[10];

/*
 * Shorthands for working with the parse structure.
 * BEWARE: every one of them assumes the structure is reachable through a
 * variable named `p' in the scope where the macro is used.
 */

/* inspecting the input without consuming it */
#define	MORE()		(p->next < p->end)
#define	MORE2()		(p->next+1 < p->end)
#define	PEEK()		(p->next[0])
#define	PEEK2()		(p->next[1])
#define	SEE(c)		(MORE() && PEEK() == (c))
#define	SEETWO(a, b)	(MORE() && MORE2() && PEEK() == (a) && PEEK2() == (b))

/* consuming input */
#define	NEXT()		(p->next++)
#define	NEXT2()		(p->next += 2)
#define	NEXTn(n)	(p->next += (n))
#define	GETNEXT()	(*p->next++)
#define	EAT(c)		((SEE(c)) ? (NEXT(), 1) : 0)
#define	EATTWO(a, b)	((SEETWO(a, b)) ? (NEXT2(), 1) : 0)

/* error reporting: REQUIRE records error `e' unless `co' holds */
#define	SETERROR(e)	seterr(p, (e))
#define	REQUIRE(co, e)	(void)((co) || SETERROR(e))
#define	MUSTSEE(c, e)	(REQUIRE(MORE() && PEEK() == (c), e))
#define	MUSTEAT(c, e)	(REQUIRE(MORE() && GETNEXT() == (c), e))
#define	MUSTNOTSEE(c, e)	(REQUIRE(!MORE() || PEEK() != (c), e))

/* positions in the strip, counted from its current length */
#define	HERE()		(p->slen)
#define	THERE()		(p->slen - 1)
#define	THERETHERE()	(p->slen - 2)
#define	DROP(n)		(p->slen -= (n))

/* appending to and patching the strip */
#define	EMIT(op, sopnd)	doemit(p, (sop)(op), (size_t)(sopnd))
#define	INSERT(op, pos)	doinsert(p, (sop)(op), HERE()-(pos)+1, pos)
#define	AHEAD(pos)		dofwd(p, pos, HERE()-(pos))
#define	ASTERN(op, pos)	EMIT(op, HERE()-pos)

/* largest repetition count accepted in a bound */
#ifndef _POSIX2_RE_DUP_MAX
#define	DUPMAX	255
#else
#define	DUPMAX	_POSIX2_RE_DUP_MAX
#endif

/* always-zero value for use inside assert() expressions */
#ifdef NDEBUG
#define	never	0
#else
static int never = 0;
#endif
150
151 /*
152 - llvm_regcomp - interface for parser and compilation
153 */
154 int /* 0 success, otherwise REG_something */
155 llvm_regcomp(llvm_regex_t *preg, const char *pattern, int cflags)
156 {
157 struct parse pa;
158 struct re_guts *g;
159 struct parse *p = &pa;
160 int i;
161 size_t len;
162 #ifdef REDEBUG
163 # define GOODFLAGS(f) (f)
164 #else
165 # define GOODFLAGS(f) ((f)&~REG_DUMP)
166 #endif
167
168 cflags = GOODFLAGS(cflags);
169 if ((cflags®_EXTENDED) && (cflags®_NOSPEC))
170 return(REG_INVARG);
171
172 if (cflags®_PEND) {
173 if (preg->re_endp < pattern)
174 return(REG_INVARG);
175 len = preg->re_endp - pattern;
176 } else
177 len = strlen((const char *)pattern);
178
179 /* do the mallocs early so failure handling is easy */
180 g = (struct re_guts *)malloc(sizeof(struct re_guts) +
181 (NC-1)*sizeof(cat_t));
182 if (g == NULL)
183 return(REG_ESPACE);
184 p->ssize = len/(size_t)2*(size_t)3 + (size_t)1; /* ugh */
185 p->strip = (sop *)calloc(p->ssize, sizeof(sop));
186 p->slen = 0;
187 if (p->strip == NULL) {
188 free((char *)g);
189 return(REG_ESPACE);
190 }
191
192 /* set things up */
193 p->g = g;
194 p->next = (char *)pattern; /* convenience; we do not modify it */
195 p->end = p->next + len;
196 p->error = 0;
197 p->ncsalloc = 0;
198 for (i = 0; i < NPAREN; i++) {
199 p->pbegin[i] = 0;
200 p->pend[i] = 0;
201 }
202 g->csetsize = NC;
203 g->sets = NULL;
204 g->setbits = NULL;
205 g->ncsets = 0;
206 g->cflags = cflags;
207 g->iflags = 0;
208 g->nbol = 0;
209 g->neol = 0;
210 g->must = NULL;
211 g->mlen = 0;
212 g->nsub = 0;
213 g->ncategories = 1; /* category 0 is "everything else" */
214 g->categories = &g->catspace[-(CHAR_MIN)];
215 (void) memset((char *)g->catspace, 0, NC*sizeof(cat_t));
216 g->backrefs = 0;
217
218 /* do it */
219 EMIT(OEND, 0);
220 g->firststate = THERE();
221 if (cflags®_EXTENDED)
222 p_ere(p, OUT);
223 else if (cflags®_NOSPEC)
224 p_str(p);
225 else
226 p_bre(p, OUT, OUT);
227 EMIT(OEND, 0);
228 g->laststate = THERE();
229
230 /* tidy up loose ends and fill things in */
231 categorize(p, g);
232 stripsnug(p, g);
233 findmust(p, g);
234 g->nplus = pluscount(p, g);
235 g->magic = MAGIC2;
236 preg->re_nsub = g->nsub;
237 preg->re_g = g;
238 preg->re_magic = MAGIC1;
239 #ifndef REDEBUG
240 /* not debugging, so can't rely on the assert() in llvm_regexec() */
241 if (g->iflags®EX_BAD)
242 SETERROR(REG_ASSERT);
243 #endif
244
245 /* win or lose, we're done */
246 if (p->error != 0) /* lose */
247 llvm_regfree(preg);
248 return(p->error);
249 }
250
251 /*
252 - p_ere - ERE parser top level, concatenation and alternation
253 */
254 static void
255 p_ere(struct parse *p, int stop) /* character this ERE should end at */
256 {
257 char c;
258 sopno prevback = prevback;
259 sopno prevfwd = prevfwd;
260 sopno conc;
261 int first = 1; /* is this the first alternative? */
262
263 for (;;) {
264 /* do a bunch of concatenated expressions */
265 conc = HERE();
266 while (MORE() && (c = PEEK()) != '|' && c != stop)
267 p_ere_exp(p);
268 REQUIRE(HERE() != conc, REG_EMPTY); /* require nonempty */
269
270 if (!EAT('|'))
271 break; /* NOTE BREAK OUT */
272
273 if (first) {
274 INSERT(OCH_, conc); /* offset is wrong */
275 prevfwd = conc;
276 prevback = conc;
277 first = 0;
278 }
279 ASTERN(OOR1, prevback);
280 prevback = THERE();
281 AHEAD(prevfwd); /* fix previous offset */
282 prevfwd = HERE();
283 EMIT(OOR2, 0); /* offset is very wrong */
284 }
285
286 if (!first) { /* tail-end fixups */
287 AHEAD(prevfwd);
288 ASTERN(O_CH, prevback);
289 }
290
291 assert(!MORE() || SEE(stop));
292 }
293
294 /*
295 - p_ere_exp - parse one subERE, an atom possibly followed by a repetition op
296 */
297 static void
298 p_ere_exp(struct parse *p)
299 {
300 char c;
301 sopno pos;
302 int count;
303 int count2;
304 sopno subno;
305 int wascaret = 0;
306
307 assert(MORE()); /* caller should have ensured this */
308 c = GETNEXT();
309
310 pos = HERE();
311 switch (c) {
312 case '(':
313 REQUIRE(MORE(), REG_EPAREN);
314 p->g->nsub++;
315 subno = p->g->nsub;
316 if (subno < NPAREN)
317 p->pbegin[subno] = HERE();
318 EMIT(OLPAREN, subno);
319 if (!SEE(')'))
320 p_ere(p, ')');
321 if (subno < NPAREN) {
322 p->pend[subno] = HERE();
323 assert(p->pend[subno] != 0);
324 }
325 EMIT(ORPAREN, subno);
326 MUSTEAT(')', REG_EPAREN);
327 break;
328 #ifndef POSIX_MISTAKE
329 case ')': /* happens only if no current unmatched ( */
330 /*
331 * You may ask, why the ifndef? Because I didn't notice
332 * this until slightly too late for 1003.2, and none of the
333 * other 1003.2 regular-expression reviewers noticed it at
334 * all. So an unmatched ) is legal POSIX, at least until
335 * we can get it fixed.
336 */
337 SETERROR(REG_EPAREN);
338 break;
339 #endif
340 case '^':
341 EMIT(OBOL, 0);
342 p->g->iflags |= USEBOL;
343 p->g->nbol++;
344 wascaret = 1;
345 break;
346 case '$':
347 EMIT(OEOL, 0);
348 p->g->iflags |= USEEOL;
349 p->g->neol++;
350 break;
351 case '|':
352 SETERROR(REG_EMPTY);
353 break;
354 case '*':
355 case '+':
356 case '?':
357 SETERROR(REG_BADRPT);
358 break;
359 case '.':
360 if (p->g->cflags®_NEWLINE)
361 nonnewline(p);
362 else
363 EMIT(OANY, 0);
364 break;
365 case '[':
366 p_bracket(p);
367 break;
368 case '\\':
369 REQUIRE(MORE(), REG_EESCAPE);
370 c = GETNEXT();
371 ordinary(p, c);
372 break;
373 case '{': /* okay as ordinary except if digit follows */
374 REQUIRE(!MORE() || !isdigit((uch)PEEK()), REG_BADRPT);
375 /* FALLTHROUGH */
376 default:
377 ordinary(p, c);
378 break;
379 }
380
381 if (!MORE())
382 return;
383 c = PEEK();
384 /* we call { a repetition if followed by a digit */
385 if (!( c == '*' || c == '+' || c == '?' ||
386 (c == '{' && MORE2() && isdigit((uch)PEEK2())) ))
387 return; /* no repetition, we're done */
388 NEXT();
389
390 REQUIRE(!wascaret, REG_BADRPT);
391 switch (c) {
392 case '*': /* implemented as +? */
393 /* this case does not require the (y|) trick, noKLUDGE */
394 INSERT(OPLUS_, pos);
395 ASTERN(O_PLUS, pos);
396 INSERT(OQUEST_, pos);
397 ASTERN(O_QUEST, pos);
398 break;
399 case '+':
400 INSERT(OPLUS_, pos);
401 ASTERN(O_PLUS, pos);
402 break;
403 case '?':
404 /* KLUDGE: emit y? as (y|) until subtle bug gets fixed */
405 INSERT(OCH_, pos); /* offset slightly wrong */
406 ASTERN(OOR1, pos); /* this one's right */
407 AHEAD(pos); /* fix the OCH_ */
408 EMIT(OOR2, 0); /* offset very wrong... */
409 AHEAD(THERE()); /* ...so fix it */
410 ASTERN(O_CH, THERETHERE());
411 break;
412 case '{':
413 count = p_count(p);
414 if (EAT(',')) {
415 if (isdigit((uch)PEEK())) {
416 count2 = p_count(p);
417 REQUIRE(count <= count2, REG_BADBR);
418 } else /* single number with comma */
419 count2 = INFINITY;
420 } else /* just a single number */
421 count2 = count;
422 repeat(p, pos, count, count2);
423 if (!EAT('}')) { /* error heuristics */
424 while (MORE() && PEEK() != '}')
425 NEXT();
426 REQUIRE(MORE(), REG_EBRACE);
427 SETERROR(REG_BADBR);
428 }
429 break;
430 }
431
432 if (!MORE())
433 return;
434 c = PEEK();
435 if (!( c == '*' || c == '+' || c == '?' ||
436 (c == '{' && MORE2() && isdigit((uch)PEEK2())) ) )
437 return;
438 SETERROR(REG_BADRPT);
439 }
440
441 /*
442 - p_str - string (no metacharacters) "parser"
443 */
444 static void
445 p_str(struct parse *p)
446 {
447 REQUIRE(MORE(), REG_EMPTY);
448 while (MORE())
449 ordinary(p, GETNEXT());
450 }
451
452 /*
453 - p_bre - BRE parser top level, anchoring and concatenation
454 * Giving end1 as OUT essentially eliminates the end1/end2 check.
455 *
456 * This implementation is a bit of a kludge, in that a trailing $ is first
457 * taken as an ordinary character and then revised to be an anchor. The
458 * only undesirable side effect is that '$' gets included as a character
459 * category in such cases. This is fairly harmless; not worth fixing.
460 * The amount of lookahead needed to avoid this kludge is excessive.
461 */
462 static void
463 p_bre(struct parse *p,
464 int end1, /* first terminating character */
465 int end2) /* second terminating character */
466 {
467 sopno start = HERE();
468 int first = 1; /* first subexpression? */
469 int wasdollar = 0;
470
471 if (EAT('^')) {
472 EMIT(OBOL, 0);
473 p->g->iflags |= USEBOL;
474 p->g->nbol++;
475 }
476 while (MORE() && !SEETWO(end1, end2)) {
477 wasdollar = p_simp_re(p, first);
478 first = 0;
479 }
480 if (wasdollar) { /* oops, that was a trailing anchor */
481 DROP(1);
482 EMIT(OEOL, 0);
483 p->g->iflags |= USEEOL;
484 p->g->neol++;
485 }
486
487 REQUIRE(HERE() != start, REG_EMPTY); /* require nonempty */
488 }
489
490 /*
491 - p_simp_re - parse a simple RE, an atom possibly followed by a repetition
492 */
493 static int /* was the simple RE an unbackslashed $? */
494 p_simp_re(struct parse *p,
495 int starordinary) /* is a leading * an ordinary character? */
496 {
497 int c;
498 int count;
499 int count2;
500 sopno pos;
501 int i;
502 sopno subno;
503 # define BACKSL (1<
504
505 pos = HERE(); /* repetion op, if any, covers from here */
506
507 assert(MORE()); /* caller should have ensured this */
508 c = GETNEXT();
509 if (c == '\\') {
510 REQUIRE(MORE(), REG_EESCAPE);
511 c = BACKSL | GETNEXT();
512 }
513 switch (c) {
514 case '.':
515 if (p->g->cflags®_NEWLINE)
516 nonnewline(p);
517 else
518 EMIT(OANY, 0);
519 break;
520 case '[':
521 p_bracket(p);
522 break;
523 case BACKSL|'{':
524 SETERROR(REG_BADRPT);
525 break;
526 case BACKSL|'(':
527 p->g->nsub++;
528 subno = p->g->nsub;
529 if (subno < NPAREN)
530 p->pbegin[subno] = HERE();
531 EMIT(OLPAREN, subno);
532 /* the MORE here is an error heuristic */
533 if (MORE() && !SEETWO('\\', ')'))
534 p_bre(p, '\\', ')');
535 if (subno < NPAREN) {
536 p->pend[subno] = HERE();
537 assert(p->pend[subno] != 0);
538 }
539 EMIT(ORPAREN, subno);
540 REQUIRE(EATTWO('\\', ')'), REG_EPAREN);
541 break;
542 case BACKSL|')': /* should not get here -- must be user */
543 case BACKSL|'}':
544 SETERROR(REG_EPAREN);
545 break;
546 case BACKSL|'1':
547 case BACKSL|'2':
548 case BACKSL|'3':
549 case BACKSL|'4':
550 case BACKSL|'5':
551 case BACKSL|'6':
552 case BACKSL|'7':
553 case BACKSL|'8':
554 case BACKSL|'9':
555 i = (c&~BACKSL) - '0';
556 assert(i < NPAREN);
557 if (p->pend[i] != 0) {
558 assert(i <= p->g->nsub);
559 EMIT(OBACK_, i);
560 assert(p->pbegin[i] != 0);
561 assert(OP(p->strip[p->pbegin[i]]) == OLPAREN);
562 assert(OP(p->strip[p->pend[i]]) == ORPAREN);
563 (void) dupl(p, p->pbegin[i]+1, p->pend[i]);
564 EMIT(O_BACK, i);
565 } else
566 SETERROR(REG_ESUBREG);
567 p->g->backrefs = 1;
568 break;
569 case '*':
570 REQUIRE(starordinary, REG_BADRPT);
571 /* FALLTHROUGH */
572 default:
573 ordinary(p, (char)c);
574 break;
575 }
576
577 if (EAT('*')) { /* implemented as +? */
578 /* this case does not require the (y|) trick, noKLUDGE */
579 INSERT(OPLUS_, pos);
580 ASTERN(O_PLUS, pos);
581 INSERT(OQUEST_, pos);
582 ASTERN(O_QUEST, pos);
583 } else if (EATTWO('\\', '{')) {
584 count = p_count(p);
585 if (EAT(',')) {
586 if (MORE() && isdigit((uch)PEEK())) {
587 count2 = p_count(p);
588 REQUIRE(count <= count2, REG_BADBR);
589 } else /* single number with comma */
590 count2 = INFINITY;
591 } else /* just a single number */
592 count2 = count;
593 repeat(p, pos, count, count2);
594 if (!EATTWO('\\', '}')) { /* error heuristics */
595 while (MORE() && !SEETWO('\\', '}'))
596 NEXT();
597 REQUIRE(MORE(), REG_EBRACE);
598 SETERROR(REG_BADBR);
599 }
600 } else if (c == '$') /* $ (but not \$) ends it */
601 return(1);
602
603 return(0);
604 }
605
606 /*
607 - p_count - parse a repetition count
608 */
609 static int /* the value */
610 p_count(struct parse *p)
611 {
612 int count = 0;
613 int ndigits = 0;
614
615 while (MORE() && isdigit((uch)PEEK()) && count <= DUPMAX) {
616 count = count*10 + (GETNEXT() - '0');
617 ndigits++;
618 }
619
620 REQUIRE(ndigits > 0 && count <= DUPMAX, REG_BADBR);
621 return(count);
622 }
623
624 /*
625 - p_bracket - parse a bracketed character list
626 *
627 * Note a significant property of this code: if the allocset() did SETERROR,
628 * no set operations are done.
629 */
630 static void
631 p_bracket(struct parse *p)
632 {
633 cset *cs;
634 int invert = 0;
635
636 /* Dept of Truly Sickening Special-Case Kludges */
637 if (p->next + 5 < p->end && strncmp(p->next, "[:<:]]", 6) == 0) {
638 EMIT(OBOW, 0);
639 NEXTn(6);
640 return;
641 }
642 if (p->next + 5 < p->end && strncmp(p->next, "[:>:]]", 6) == 0) {
643 EMIT(OEOW, 0);
644 NEXTn(6);
645 return;
646 }
647
648 if ((cs = allocset(p)) == NULL) {
649 /* allocset did set error status in p */
650 return;
651 }
652
653 if (EAT('^'))
654 invert++; /* make note to invert set at end */
655 if (EAT(']'))
656 CHadd(cs, ']');
657 else if (EAT('-'))
658 CHadd(cs, '-');
659 while (MORE() && PEEK() != ']' && !SEETWO('-', ']'))
660 p_b_term(p, cs);
661 if (EAT('-'))
662 CHadd(cs, '-');
663 MUSTEAT(']', REG_EBRACK);
664
665 if (p->error != 0) { /* don't mess things up further */
666 freeset(p, cs);
667 return;
668 }
669
670 if (p->g->cflags®_ICASE) {
671 int i;
672 int ci;
673
674 for (i = p->g->csetsize - 1; i >= 0; i--)
675 if (CHIN(cs, i) && isalpha(i)) {
676 ci = othercase(i);
677 if (ci != i)
678 CHadd(cs, ci);
679 }
680 if (cs->multis != NULL)
681 mccase(p, cs);
682 }
683 if (invert) {
684 int i;
685
686 for (i = p->g->csetsize - 1; i >= 0; i--)
687 if (CHIN(cs, i))
688 CHsub(cs, i);
689 else
690 CHadd(cs, i);
691 if (p->g->cflags®_NEWLINE)
692 CHsub(cs, '\n');
693 if (cs->multis != NULL)
694 mcinvert(p, cs);
695 }
696
697 assert(cs->multis == NULL); /* xxx */
698
699 if (nch(p, cs) == 1) { /* optimize singleton sets */
700 ordinary(p, firstch(p, cs));
701 freeset(p, cs);
702 } else
703 EMIT(OANYOF, freezeset(p, cs));
704 }
705
706 /*
707 - p_b_term - parse one term of a bracketed character list
708 */
709 static void
710 p_b_term(struct parse *p, cset *cs)
711 {
712 char c;
713 char start, finish;
714 int i;
715
716 /* classify what we've got */
717 switch ((MORE()) ? PEEK() : '\0') {
718 case '[':
719 c = (MORE2()) ? PEEK2() : '\0';
720 break;
721 case '-':
722 SETERROR(REG_ERANGE);
723 return; /* NOTE RETURN */
724 break;
725 default:
726 c = '\0';
727 break;
728 }
729
730 switch (c) {
731 case ':': /* character class */
732 NEXT2();
733 REQUIRE(MORE(), REG_EBRACK);
734 c = PEEK();
735 REQUIRE(c != '-' && c != ']', REG_ECTYPE);
736 p_b_cclass(p, cs);
737 REQUIRE(MORE(), REG_EBRACK);
738 REQUIRE(EATTWO(':', ']'), REG_ECTYPE);
739 break;
740 case '=': /* equivalence class */
741 NEXT2();
742 REQUIRE(MORE(), REG_EBRACK);
743 c = PEEK();
744 REQUIRE(c != '-' && c != ']', REG_ECOLLATE);
745 p_b_eclass(p, cs);
746 REQUIRE(MORE(), REG_EBRACK);
747 REQUIRE(EATTWO('=', ']'), REG_ECOLLATE);
748 break;
749 default: /* symbol, ordinary character, or range */
750 /* xxx revision needed for multichar stuff */
751 start = p_b_symbol(p);
752 if (SEE('-') && MORE2() && PEEK2() != ']') {
753 /* range */
754 NEXT();
755 if (EAT('-'))
756 finish = '-';
757 else
758 finish = p_b_symbol(p);
759 } else
760 finish = start;
761 /* xxx what about signed chars here... */
762 REQUIRE(start <= finish, REG_ERANGE);
763 for (i = start; i <= finish; i++)
764 CHadd(cs, i);
765 break;
766 }
767 }
768
769 /*
770 - p_b_cclass - parse a character-class name and deal with it
771 */
772 static void
773 p_b_cclass(struct parse *p, cset *cs)
774 {
775 char *sp = p->next;
776 struct cclass *cp;
777 size_t len;
778 const char *u;
779 char c;
780
781 while (MORE() && isalpha(PEEK()))
782 NEXT();
783 len = p->next - sp;
784 for (cp = cclasses; cp->name != NULL; cp++)
785 if (strncmp(cp->name, sp, len) == 0 && cp->name[len] == '\0')
786 break;
787 if (cp->name == NULL) {
788 /* oops, didn't find it */
789 SETERROR(REG_ECTYPE);
790 return;
791 }
792
793 u = cp->chars;
794 while ((c = *u++) != '\0')
795 CHadd(cs, c);
796 for (u = cp->multis; *u != '\0'; u += strlen(u) + 1)
797 MCadd(p, cs, u);
798 }
799
800 /*
801 - p_b_eclass - parse an equivalence-class name and deal with it
802 *
803 * This implementation is incomplete. xxx
804 */
805 static void
806 p_b_eclass(struct parse *p, cset *cs)
807 {
808 char c;
809
810 c = p_b_coll_elem(p, '=');
811 CHadd(cs, c);
812 }
813
814 /*
815 - p_b_symbol - parse a character or [..]ed multicharacter collating symbol
816 */
817 static char /* value of symbol */
818 p_b_symbol(struct parse *p)
819 {
820 char value;
821
822 REQUIRE(MORE(), REG_EBRACK);
823 if (!EATTWO('[', '.'))
824 return(GETNEXT());
825
826 /* collating symbol */
827 value = p_b_coll_elem(p, '.');
828 REQUIRE(EATTWO('.', ']'), REG_ECOLLATE);
829 return(value);
830 }
831
832 /*
833 - p_b_coll_elem - parse a collating-element name and look it up
834 */
835 static char /* value of collating element */
836 p_b_coll_elem(struct parse *p,
837 int endc) /* name ended by endc,']' */
838 {
839 char *sp = p->next;
840 struct cname *cp;
841 int len;
842
843 while (MORE() && !SEETWO(endc, ']'))
844 NEXT();
845 if (!MORE()) {
846 SETERROR(REG_EBRACK);
847 return(0);
848 }
849 len = p->next - sp;
850 for (cp = cnames; cp->name != NULL; cp++)
851 if (strncmp(cp->name, sp, len) == 0 && cp->name[len] == '\0')
852 return(cp->code); /* known name */
853 if (len == 1)
854 return(*sp); /* single character */
855 SETERROR(REG_ECOLLATE); /* neither */
856 return(0);
857 }
858
859 /*
860 - othercase - return the case counterpart of an alphabetic
861 */
862 static char /* if no counterpart, return ch */
863 othercase(int ch)
864 {
865 ch = (uch)ch;
866 assert(isalpha(ch));
867 if (isupper(ch))
868 return ((uch)tolower(ch));
869 else if (islower(ch))
870 return ((uch)toupper(ch));
871 else /* peculiar, but could happen */
872 return(ch);
873 }
874
875 /*
876 - bothcases - emit a dualcase version of a two-case character
877 *
878 * Boy, is this implementation ever a kludge...
879 */
880 static void
881 bothcases(struct parse *p, int ch)
882 {
883 char *oldnext = p->next;
884 char *oldend = p->end;
885 char bracket[3];
886
887 ch = (uch)ch;
888 assert(othercase(ch) != ch); /* p_bracket() would recurse */
889 p->next = bracket;
890 p->end = bracket+2;
891 bracket[0] = ch;
892 bracket[1] = ']';
893 bracket[2] = '\0';
894 p_bracket(p);
895 assert(p->next == bracket+2);
896 p->next = oldnext;
897 p->end = oldend;
898 }
899
900 /*
901 - ordinary - emit an ordinary character
902 */
903 static void
904 ordinary(struct parse *p, int ch)
905 {
906 cat_t *cap = p->g->categories;
907
908 if ((p->g->cflags®_ICASE) && isalpha((uch)ch) && othercase(ch) != ch)
909 bothcases(p, ch);
910 else {
911 EMIT(OCHAR, (uch)ch);
912 if (cap[ch] == 0)
913 cap[ch] = p->g->ncategories++;
914 }
915 }
916
917 /*
918 - nonnewline - emit REG_NEWLINE version of OANY
919 *
920 * Boy, is this implementation ever a kludge...
921 */
922 static void
923 nonnewline(struct parse *p)
924 {
925 char *oldnext = p->next;
926 char *oldend = p->end;
927 char bracket[4];
928
929 p->next = bracket;
930 p->end = bracket+3;
931 bracket[0] = '^';
932 bracket[1] = '\n';
933 bracket[2] = ']';
934 bracket[3] = '\0';
935 p_bracket(p);
936 assert(p->next == bracket+3);
937 p->next = oldnext;
938 p->end = oldend;
939 }
940
941 /*
942 - repeat - generate code for a bounded repetition, recursively if needed
943 */
944 static void
945 repeat(struct parse *p,
946 sopno start, /* operand from here to end of strip */
947 int from, /* repeated from this number */
948 int to) /* to this number of times (maybe INFINITY) */
949 {
950 sopno finish = HERE();
951 # define N 2
952 # define INF 3
953 # define REP(f, t) ((f)*8 + (t))
954 # define MAP(n) (((n) <= 1) ? (n) : ((n) == INFINITY) ? INF : N)
955 sopno copy;
956
957 if (p->error != 0) /* head off possible runaway recursion */
958 return;
959
960 assert(from <= to);
961
962 switch (REP(MAP(from), MAP(to))) {
963 case REP(0, 0): /* must be user doing this */
964 DROP(finish-start); /* drop the operand */
965 break;
966 case REP(0, 1): /* as x{1,1}? */
967 case REP(0, N): /* as x{1,n}? */
968 case REP(0, INF): /* as x{1,}? */
969 /* KLUDGE: emit y? as (y|) until subtle bug gets fixed */
970 INSERT(OCH_, start); /* offset is wrong... */
971 repeat(p, start+1, 1, to);
972 ASTERN(OOR1, start);
973 AHEAD(start); /* ... fix it */
974 EMIT(OOR2, 0);
975 AHEAD(THERE());
976 ASTERN(O_CH, THERETHERE());
977 break;
978 case REP(1, 1): /* trivial case */
979 /* done */
980 break;
981 case REP(1, N): /* as x?x{1,n-1} */
982 /* KLUDGE: emit y? as (y|) until subtle bug gets fixed */
983 INSERT(OCH_, start);
984 ASTERN(OOR1, start);
985 AHEAD(start);
986 EMIT(OOR2, 0); /* offset very wrong... */
987 AHEAD(THERE()); /* ...so fix it */
988 ASTERN(O_CH, THERETHERE());
989 copy = dupl(p, start+1, finish+1);
990 assert(copy == finish+4);
991 repeat(p, copy, 1, to-1);
992 break;
993 case REP(1, INF): /* as x+ */
994 INSERT(OPLUS_, start);
995 ASTERN(O_PLUS, start);
996 break;
997 case REP(N, N): /* as xx{m-1,n-1} */
998 copy = dupl(p, start, finish);
999 repeat(p, copy, from-1, to-1);
1000 break;
1001 case REP(N, INF): /* as xx{n-1,INF} */
1002 copy = dupl(p, start, finish);
1003 repeat(p, copy, from-1, to);
1004 break;
1005 default: /* "can't happen" */
1006 SETERROR(REG_ASSERT); /* just in case */
1007 break;
1008 }
1009 }
1010
1011 /*
1012 - seterr - set an error condition
1013 */
1014 static int /* useless but makes type checking happy */
1015 seterr(struct parse *p, int e)
1016 {
1017 if (p->error == 0) /* keep earliest error condition */
1018 p->error = e;
1019 p->next = nuls; /* try to bring things to a halt */
1020 p->end = nuls;
1021 return(0); /* make the return value well-defined */
1022 }
1023
1024 /*
1025 - allocset - allocate a set of characters for []
1026 */
1027 static cset *
1028 allocset(struct parse *p)
1029 {
1030 int no = p->g->ncsets++;
1031 size_t nc;
1032 size_t nbytes;
1033 cset *cs;
1034 size_t css = (size_t)p->g->csetsize;
1035 int i;
1036
1037 if (no >= p->ncsalloc) { /* need another column of space */
1038 void *ptr;
1039
1040 p->ncsalloc += CHAR_BIT;
1041 nc = p->ncsalloc;
1042 assert(nc % CHAR_BIT == 0);
1043 nbytes = nc / CHAR_BIT * css;
1044
1045 ptr = (cset *)realloc((char *)p->g->sets, nc * sizeof(cset));
1046 if (ptr == NULL)
1047 goto nomem;
1048 p->g->sets = ptr;
1049
1050 ptr = (uch *)realloc((char *)p->g->setbits, nbytes);
1051 if (ptr == NULL)
1052 goto nomem;
1053 p->g->setbits = ptr;
1054
1055 for (i = 0; i < no; i++)
1056 p->g->sets[i].ptr = p->g->setbits + css*(i/CHAR_BIT);
1057
1058 (void) memset((char *)p->g->setbits + (nbytes - css), 0, css);
1059 }
1060 /* XXX should not happen */
1061 if (p->g->sets == NULL || p->g->setbits == NULL)
1062 goto nomem;
1063
1064 cs = &p->g->sets[no];
1065 cs->ptr = p->g->setbits + css*((no)/CHAR_BIT);
1066 cs->mask = 1 << ((no) % CHAR_BIT);
1067 cs->hash = 0;
1068 cs->smultis = 0;
1069 cs->multis = NULL;
1070
1071 return(cs);
1072 nomem:
1073 free(p->g->sets);
1074 p->g->sets = NULL;
1075 free(p->g->setbits);
1076 p->g->setbits = NULL;
1077
1078 SETERROR(REG_ESPACE);
1079 /* caller's responsibility not to do set ops */
1080 return(NULL);
1081 }
1082
1083 /*
1084 - freeset - free a now-unused set
1085 */
1086 static void
1087 freeset(struct parse *p, cset *cs)
1088 {
1089 size_t i;
1090 cset *top = &p->g->sets[p->g->ncsets];
1091 size_t css = (size_t)p->g->csetsize;
1092
1093 for (i = 0; i < css; i++)
1094 CHsub(cs, i);
1095 if (cs == top-1) /* recover only the easy case */
1096 p->g->ncsets--;
1097 }
1098
1099 /*
1100 - freezeset - final processing on a set of characters
1101 *
1102 * The main task here is merging identical sets. This is usually a waste
1103 * of time (although the hash code minimizes the overhead), but can win
1104 * big if REG_ICASE is being used. REG_ICASE, by the way, is why the hash
1105 * is done using addition rather than xor -- all ASCII [aA] sets xor to
1106 * the same value!
1107 */
1108 static int /* set number */
1109 freezeset(struct parse *p, cset *cs)
1110 {
1111 uch h = cs->hash;
1112 size_t i;
1113 cset *top = &p->g->sets[p->g->ncsets];
1114 cset *cs2;
1115 size_t css = (size_t)p->g->csetsize;
1116
1117 /* look for an earlier one which is the same */
1118 for (cs2 = &p->g->sets[0]; cs2 < top; cs2++)
1119 if (cs2->hash == h && cs2 != cs) {
1120 /* maybe */
1121 for (i = 0; i < css; i++)
1122 if (!!CHIN(cs2, i) != !!CHIN(cs, i))
1123 break; /* no */
1124 if (i == css)
1125 break; /* yes */
1126 }
1127
1128 if (cs2 < top) { /* found one */
1129 freeset(p, cs);
1130 cs = cs2;
1131 }
1132
1133 return((int)(cs - p->g->sets));
1134 }
1135
1136 /*
1137 - firstch - return first character in a set (which must have at least one)
1138 */
1139 static int /* character; there is no "none" value */
1140 firstch(struct parse *p, cset *cs)
1141 {
1142 size_t i;
1143 size_t css = (size_t)p->g->csetsize;
1144
1145 for (i = 0; i < css; i++)
1146 if (CHIN(cs, i))
1147 return((char)i);
1148 assert(never);
1149 return(0); /* arbitrary */
1150 }
1151
1152 /*
1153 - nch - number of characters in a set
1154 */
1155 static int
1156 nch(struct parse *p, cset *cs)
1157 {
1158 size_t i;
1159 size_t css = (size_t)p->g->csetsize;
1160 int n = 0;
1161
1162 for (i = 0; i < css; i++)
1163 if (CHIN(cs, i))
1164 n++;
1165 return(n);
1166 }
1167
1168 /*
1169 - mcadd - add a collating element to a cset
1170 */
1171 static void
1172 mcadd( struct parse *p, cset *cs, const char *cp)
1173 {
1174 size_t oldend = cs->smultis;
1175 void *np;
1176
1177 cs->smultis += strlen(cp) + 1;
1178 np = realloc(cs->multis, cs->smultis);
1179 if (np == NULL) {
1180 if (cs->multis)
1181 free(cs->multis);
1182 cs->multis = NULL;
1183 SETERROR(REG_ESPACE);
1184 return;
1185 }
1186 cs->multis = np;
1187
1188 llvm_strlcpy(cs->multis + oldend - 1, cp, cs->smultis - oldend + 1);
1189 }
1190
1191 /*
1192 - mcinvert - invert the list of collating elements in a cset
1193 *
1194 * This would have to know the set of possibilities. Implementation
1195 * is deferred.
1196 */
1197 /* ARGSUSED */
1198 static void
1199 mcinvert(struct parse *p, cset *cs)
1200 {
1201 assert(cs->multis == NULL); /* xxx */
1202 }
1203
1204 /*
1205 - mccase - add case counterparts of the list of collating elements in a cset
1206 *
1207 * This would have to know the set of possibilities. Implementation
1208 * is deferred.
1209 */
1210 /* ARGSUSED */
1211 static void
1212 mccase(struct parse *p, cset *cs)
1213 {
1214 assert(cs->multis == NULL); /* xxx */
1215 }
1216
1217 /*
1218 - isinsets - is this character in any sets?
1219 */
1220 static int /* predicate */
1221 isinsets(struct re_guts *g, int c)
1222 {
1223 uch *col;
1224 int i;
1225 int ncols = (g->ncsets+(CHAR_BIT-1)) / CHAR_BIT;
1226 unsigned uc = (uch)c;
1227
1228 for (i = 0, col = g->setbits; i < ncols; i++, col += g->csetsize)
1229 if (col[uc] != 0)
1230 return(1);
1231 return(0);
1232 }
1233
1234 /*
1235 - samesets - are these two characters in exactly the same sets?
1236 */
1237 static int /* predicate */
1238 samesets(struct re_guts *g, int c1, int c2)
1239 {
1240 uch *col;
1241 int i;
1242 int ncols = (g->ncsets+(CHAR_BIT-1)) / CHAR_BIT;
1243 unsigned uc1 = (uch)c1;
1244 unsigned uc2 = (uch)c2;
1245
1246 for (i = 0, col = g->setbits; i < ncols; i++, col += g->csetsize)
1247 if (col[uc1] != col[uc2])
1248 return(0);
1249 return(1);
1250 }
1251
1252 /*
1253 - categorize - sort out character categories
1254 */
1255 static void
1256 categorize(struct parse *p, struct re_guts *g)
1257 {
1258 cat_t *cats = g->categories;
1259 int c;
1260 int c2;
1261 cat_t cat;
1262
1263 /* avoid making error situations worse */
1264 if (p->error != 0)
1265 return;
1266
1267 for (c = CHAR_MIN; c <= CHAR_MAX; c++)
1268 if (cats[c] == 0 && isinsets(g, c)) {
1269 cat = g->ncategories++;
1270 cats[c] = cat;
1271 for (c2 = c+1; c2 <= CHAR_MAX; c2++)
1272 if (cats[c2] == 0 && samesets(g, c, c2))
1273 cats[c2] = cat;
1274 }
1275 }
1276
1277 /*
1278 - dupl - emit a duplicate of a bunch of sops
1279 */
1280 static sopno /* start of duplicate */
1281 dupl(struct parse *p,
1282 sopno start, /* from here */
1283 sopno finish) /* to this less one */
1284 {
1285 sopno ret = HERE();
1286 sopno len = finish - start;
1287
1288 assert(finish >= start);
1289 if (len == 0)
1290 return(ret);
1291 enlarge(p, p->ssize + len); /* this many unexpected additions */
1292 assert(p->ssize >= p->slen + len);
1293 (void) memmove((char *)(p->strip + p->slen),
1294 (char *)(p->strip + start), (size_t)len*sizeof(sop));
1295 p->slen += len;
1296 return(ret);
1297 }
1298
1299 /*
1300 - doemit - emit a strip operator
1301 *
1302 * It might seem better to implement this as a macro with a function as
1303 * hard-case backup, but it's just too big and messy unless there are
1304 * some changes to the data structures. Maybe later.
1305 */
1306 static void
1307 doemit(struct parse *p, sop op, size_t opnd)
1308 {
1309 /* avoid making error situations worse */
1310 if (p->error != 0)
1311 return;
1312
1313 /* deal with oversize operands ("can't happen", more or less) */
1314 assert(opnd < 1<
1315
1316 /* deal with undersized strip */
1317 if (p->slen >= p->ssize)
1318 enlarge(p, (p->ssize+1) / 2 * 3); /* +50% */
1319 assert(p->slen < p->ssize);
1320
1321 /* finally, it's all reduced to the easy case */
1322 p->strip[p->slen++] = SOP(op, opnd);
1323 }
1324
1325 /*
1326 - doinsert - insert a sop into the strip
1327 */
1328 static void
1329 doinsert(struct parse *p, sop op, size_t opnd, sopno pos)
1330 {
1331 sopno sn;
1332 sop s;
1333 int i;
1334
1335 /* avoid making error situations worse */
1336 if (p->error != 0)
1337 return;
1338
1339 sn = HERE();
1340 EMIT(op, opnd); /* do checks, ensure space */
1341 assert(HERE() == sn+1);
1342 s = p->strip[sn];
1343
1344 /* adjust paren pointers */
1345 assert(pos > 0);
1346 for (i = 1; i < NPAREN; i++) {
1347 if (p->pbegin[i] >= pos) {
1348 p->pbegin[i]++;
1349 }
1350 if (p->pend[i] >= pos) {
1351 p->pend[i]++;
1352 }
1353 }
1354
1355 memmove((char *)&p->strip[pos+1], (char *)&p->strip[pos],
1356 (HERE()-pos-1)*sizeof(sop));
1357 p->strip[pos] = s;
1358 }
1359
1360 /*
1361 - dofwd - complete a forward reference
1362 */
1363 static void
1364 dofwd(struct parse *p, sopno pos, sop value)
1365 {
1366 /* avoid making error situations worse */
1367 if (p->error != 0)
1368 return;
1369
1370 assert(value < 1<
1371 p->strip[pos] = OP(p->strip[pos]) | value;
1372 }
1373
1374 /*
1375 - enlarge - enlarge the strip
1376 */
1377 static void
1378 enlarge(struct parse *p, sopno size)
1379 {
1380 sop *sp;
1381
1382 if (p->ssize >= size)
1383 return;
1384
1385 sp = (sop *)realloc(p->strip, size*sizeof(sop));
1386 if (sp == NULL) {
1387 SETERROR(REG_ESPACE);
1388 return;
1389 }
1390 p->strip = sp;
1391 p->ssize = size;
1392 }
1393
1394 /*
1395 - stripsnug - compact the strip
1396 */
1397 static void
1398 stripsnug(struct parse *p, struct re_guts *g)
1399 {
1400 g->nstates = p->slen;
1401 g->strip = (sop *)realloc((char *)p->strip, p->slen * sizeof(sop));
1402 if (g->strip == NULL) {
1403 SETERROR(REG_ESPACE);
1404 g->strip = p->strip;
1405 }
1406 }
1407
1408 /*
1409 - findmust - fill in must and mlen with longest mandatory literal string
1410 *
1411 * This algorithm could do fancy things like analyzing the operands of |
1412 * for common subsequences. Someday. This code is simple and finds most
1413 * of the interesting cases.
1414 *
1415 * Note that must and mlen got initialized during setup.
1416 */
1417 static void
1418 findmust(struct parse *p, struct re_guts *g)
1419 {
1420 sop *scan;
1421 sop *start; /* start initialized in the default case, after that */
1422 sop *newstart; /* newstart was initialized in the OCHAR case */
1423 sopno newlen;
1424 sop s;
1425 char *cp;
1426 sopno i;
1427
1428 /* avoid making error situations worse */
1429 if (p->error != 0)
1430 return;
1431
1432 /* find the longest OCHAR sequence in strip */
1433 newlen = 0;
1434 scan = g->strip + 1;
1435 do {
1436 s = *scan++;
1437 switch (OP(s)) {
1438 case OCHAR: /* sequence member */
1439 if (newlen == 0) /* new sequence */
1440 newstart = scan - 1;
1441 newlen++;
1442 break;
1443 case OPLUS_: /* things that don't break one */
1444 case OLPAREN:
1445 case ORPAREN:
1446 break;
1447 case OQUEST_: /* things that must be skipped */
1448 case OCH_:
1449 scan--;
1450 do {
1451 scan += OPND(s);
1452 s = *scan;
1453 /* assert() interferes w debug printouts */
1454 if (OP(s) != O_QUEST && OP(s) != O_CH &&
1455 OP(s) != OOR2) {
1456 g->iflags |= REGEX_BAD;
1457 return;
1458 }
1459 } while (OP(s) != O_QUEST && OP(s) != O_CH);
1460 /* fallthrough */
1461 default: /* things that break a sequence */
1462 if (newlen > g->mlen) { /* ends one */
1463 start = newstart;
1464 g->mlen = newlen;
1465 }
1466 newlen = 0;
1467 break;
1468 }
1469 } while (OP(s) != OEND);
1470
1471 if (g->mlen == 0) /* there isn't one */
1472 return;
1473
1474 /* turn it into a character string */
1475 g->must = malloc((size_t)g->mlen + 1);
1476 if (g->must == NULL) { /* argh; just forget it */
1477 g->mlen = 0;
1478 return;
1479 }
1480 cp = g->must;
1481 scan = start;
1482 for (i = g->mlen; i > 0; i--) {
1483 while (OP(s = *scan++) != OCHAR)
1484 continue;
1485 assert(cp < g->must + g->mlen);
1486 *cp++ = (char)OPND(s);
1487 }
1488 assert(cp == g->must + g->mlen);
1489 *cp++ = '\0'; /* just on general principles */
1490 }
1491
1492 /*
1493 - pluscount - count + nesting
1494 */
1495 static sopno /* nesting depth */
1496 pluscount(struct parse *p, struct re_guts *g)
1497 {
1498 sop *scan;
1499 sop s;
1500 sopno plusnest = 0;
1501 sopno maxnest = 0;
1502
1503 if (p->error != 0)
1504 return(0); /* there may not be an OEND */
1505
1506 scan = g->strip + 1;
1507 do {
1508 s = *scan++;
1509 switch (OP(s)) {
1510 case OPLUS_:
1511 plusnest++;
1512 break;
1513 case O_PLUS:
1514 if (plusnest > maxnest)
1515 maxnest = plusnest;
1516 plusnest--;
1517 break;
1518 }
1519 } while (OP(s) != OEND);
1520 if (plusnest != 0)
1521 g->iflags |= REGEX_BAD;
1522 return(maxnest);
1523 }
0 /*-
1 * This code is derived from OpenBSD's libc/regex, original license follows:
2 *
3 * Copyright (c) 1992, 1993, 1994 Henry Spencer.
4 * Copyright (c) 1992, 1993, 1994
5 * The Regents of the University of California. All rights reserved.
6 *
7 * This code is derived from software contributed to Berkeley by
8 * Henry Spencer.
9 *
10 * Redistribution and use in source and binary forms, with or without
11 * modification, are permitted provided that the following conditions
12 * are met:
13 * 1. Redistributions of source code must retain the above copyright
14 * notice, this list of conditions and the following disclaimer.
15 * 2. Redistributions in binary form must reproduce the above copyright
16 * notice, this list of conditions and the following disclaimer in the
17 * documentation and/or other materials provided with the distribution.
18 * 3. Neither the name of the University nor the names of its contributors
19 * may be used to endorse or promote products derived from this software
20 * without specific prior written permission.
21 *
22 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
23 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
24 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
25 * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
26 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
27 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
28 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
29 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
30 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
31 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
32 * SUCH DAMAGE.
33 *
34 * @(#)engine.c 8.5 (Berkeley) 3/20/94
35 */
36
37 /*
38 * The matching engine and friends. This file is #included by regexec.c
39 * after suitable #defines of a variety of macros used herein, so that
40 * different state representations can be used without duplicating masses
41 * of code.
42 */
43
/*
 * Because this file is #included more than once (see above), every name
 * it defines is aliased here to a per-inclusion name: the s... names when
 * SNAMES is defined, the l... names when LNAMES is defined.  Any new
 * file-scope name added below needs an entry in both lists.
 */
44 #ifdef SNAMES
45 #define matcher smatcher
46 #define fast sfast
47 #define slow sslow
48 #define dissect sdissect
49 #define backref sbackref
50 #define step sstep
51 #define print sprint
52 #define at sat
53 #define match smat
54 #define nope snope
55 #endif
56 #ifdef LNAMES
57 #define matcher lmatcher
58 #define fast lfast
59 #define slow lslow
60 #define dissect ldissect
61 #define backref lbackref
62 #define step lstep
63 #define print lprint
64 #define at lat
65 #define match lmat
66 #define nope lnope
67 #endif
68
69 /* another structure passed up and down to avoid zillions of parameters */
/*
 * matcher() fills one of these in on its own stack and hands a pointer to
 * it to the other engine routines.
 */
70 struct match {
71 struct re_guts *g;
/* g: the compiled RE handed to matcher() */
72 int eflags;
/* eflags: the execution flags handed to matcher() */
73 llvm_regmatch_t *pmatch; /* [nsub+1] (0 element unused) */
74 char *offp; /* offsets work from here */
75 char *beginp; /* start of string -- virtual NUL precedes */
76 char *endp; /* end of string -- virtual NUL here */
77 char *coldp; /* can be no match starting before here */
78 char **lastpos; /* [nplus+1] */
/* STATEVARS: extra members supplied by the including file's macro */
79 STATEVARS;
80 states st; /* current states */
81 states fresh; /* states for a fresh start */
82 states tmp; /* temporary */
83 states empty; /* empty set of states */
84 };
85
86 static int matcher(struct re_guts *, char *, size_t, llvm_regmatch_t[], int);
87 static char *dissect(struct match *, char *, char *, sopno, sopno);
88 static char *backref(struct match *, char *, char *, sopno, sopno, sopno, int);
89 static char *fast(struct match *, char *, char *, sopno, sopno);
90 static char *slow(struct match *, char *, char *, sopno, sopno);
91 static states step(struct re_guts *, sopno, sopno, states, int, states);
92 #define MAX_RECURSION 100
93 #define BOL (OUT+1)
94 #define EOL (BOL+1)
95 #define BOLEOL (BOL+2)
96 #define NOTHING (BOL+3)
97 #define BOW (BOL+4)
98 #define EOW (BOL+5)
99 #define CODEMAX (BOL+5) /* highest code used */
100 #define NONCHAR(c) ((c) > CHAR_MAX)
101 #define NNONCHAR (CODEMAX-CHAR_MAX)
102 #ifdef REDEBUG
103 static void print(struct match *, char *, states, int, FILE *);
104 #endif
105 #ifdef REDEBUG
106 static void at(struct match *, char *, char *, char *, sopno, sopno);
107 #endif
108 #ifdef REDEBUG
109 static char *pchar(int);
110 #endif
111
112 #ifdef REDEBUG
113 #define SP(t, s, c) print(m, t, s, c, stdout)
114 #define AT(t, p1, p2, s1, s2) at(m, t, p1, p2, s1, s2)
115 #define NOTE(str) { if (m->eflags®_TRACE) (void)printf("=%s\n", (str)); }
116 static int nope = 0;
117 #else
118 #define SP(t, s, c) /* nothing */
119 #define AT(t, p1, p2, s1, s2) /* nothing */
120 #define NOTE(s) /* nothing */
121 #endif
122
123 /*
124 - matcher - the actual matching engine
125 */
126 static int /* 0 success, REG_NOMATCH failure */
127 matcher(struct re_guts *g, char *string, size_t nmatch, llvm_regmatch_t pmatch[],
128 int eflags)
129 {
130 char *endp;
131 size_t i;
132 struct match mv;
133 struct match *m = &mv;
134 char *dp;
135 const sopno gf = g->firststate+1; /* +1 for OEND */
136 const sopno gl = g->laststate;
137 char *start;
138 char *stop;
139
140 /* simplify the situation where possible */
141 if (g->cflags®_NOSUB)
142 nmatch = 0;
143 if (eflags®_STARTEND) {
144 start = string + pmatch[0].rm_so;
145 stop = string + pmatch[0].rm_eo;
146 } else {
147 start = string;
148 stop = start + strlen(start);
149 }
150 if (stop < start)
151 return(REG_INVARG);
152
153 /* prescreening; this does wonders for this rather slow code */
154 if (g->must != NULL) {
155 for (dp = start; dp < stop; dp++)
156 if (*dp == g->must[0] && stop - dp >= g->mlen &&
157 memcmp(dp, g->must, (size_t)g->mlen) == 0)
158 break;
159 if (dp == stop) /* we didn't find g->must */
160 return(REG_NOMATCH);
161 }
162
163 /* match struct setup */
164 m->g = g;
165 m->eflags = eflags;
166 m->pmatch = NULL;
167 m->lastpos = NULL;
168 m->offp = string;
169 m->beginp = start;
170 m->endp = stop;
171 STATESETUP(m, 4);
172 SETUP(m->st);
173 SETUP(m->fresh);
174 SETUP(m->tmp);
175 SETUP(m->empty);
176 CLEAR(m->empty);
177
178 /* this loop does only one repetition except for backrefs */
179 for (;;) {
180 endp = fast(m, start, stop, gf, gl);
181 if (endp == NULL) { /* a miss */
182 free(m->pmatch);
183 free(m->lastpos);
184 STATETEARDOWN(m);
185 return(REG_NOMATCH);
186 }
187 if (nmatch == 0 && !g->backrefs)
188 break; /* no further info needed */
189
190 /* where? */
191 assert(m->coldp != NULL);
192 for (;;) {
193 NOTE("finding start");
194 endp = slow(m, m->coldp, stop, gf, gl);
195 if (endp != NULL)
196 break;
197 assert(m->coldp < m->endp);
198 m->coldp++;
199 }
200 if (nmatch == 1 && !g->backrefs)
201 break; /* no further info needed */
202
203 /* oh my, he wants the subexpressions... */
204 if (m->pmatch == NULL)
205 m->pmatch = (llvm_regmatch_t *)malloc((m->g->nsub + 1) *
206 sizeof(llvm_regmatch_t));
207 if (m->pmatch == NULL) {
208 STATETEARDOWN(m);
209 return(REG_ESPACE);
210 }
211 for (i = 1; i <= m->g->nsub; i++)
212 m->pmatch[i].rm_so = m->pmatch[i].rm_eo = -1;
213 if (!g->backrefs && !(m->eflags®_BACKR)) {
214 NOTE("dissecting");
215 dp = dissect(m, m->coldp, endp, gf, gl);
216 } else {
217 if (g->nplus > 0 && m->lastpos == NULL)
218 m->lastpos = (char **)malloc((g->nplus+1) *
219 sizeof(char *));
220 if (g->nplus > 0 && m->lastpos == NULL) {
221 free(m->pmatch);
222 STATETEARDOWN(m);
223 return(REG_ESPACE);
224 }
225 NOTE("backref dissect");
226 dp = backref(m, m->coldp, endp, gf, gl, (sopno)0, 0);
227 }
228 if (dp != NULL)
229 break;
230
231 /* uh-oh... we couldn't find a subexpression-level match */
232 assert(g->backrefs); /* must be back references doing it */
233 assert(g->nplus == 0 || m->lastpos != NULL);
234 for (;;) {
235 if (dp != NULL || endp <= m->coldp)
236 break; /* defeat */
237 NOTE("backoff");
238 endp = slow(m, m->coldp, endp-1, gf, gl);
239 if (endp == NULL)
240 break; /* defeat */
241 /* try it on a shorter possibility */
242 #ifndef NDEBUG
243 for (i = 1; i <= m->g->nsub; i++) {
244 assert(m->pmatch[i].rm_so == -1);
245 assert(m->pmatch[i].rm_eo == -1);
246 }
247 #endif
248 NOTE("backoff dissect");
249 dp = backref(m, m->coldp, endp, gf, gl, (sopno)0, 0);
250 }
251 assert(dp == NULL || dp == endp);
252 if (dp != NULL) /* found a shorter one */
253 break;
254
255 /* despite initial appearances, there is no match here */
256 NOTE("false alarm");
257 if (m->coldp == stop)
258 break;
259 start = m->coldp + 1; /* recycle starting later */
260 }
261
262 /* fill in the details if requested */
263 if (nmatch > 0) {
264 pmatch[0].rm_so = m->coldp - m->offp;
265 pmatch[0].rm_eo = endp - m->offp;
266 }
267 if (nmatch > 1) {
268 assert(m->pmatch != NULL);
269 for (i = 1; i < nmatch; i++)
270 if (i <= m->g->nsub)
271 pmatch[i] = m->pmatch[i];
272 else {
273 pmatch[i].rm_so = -1;
274 pmatch[i].rm_eo = -1;
275 }
276 }
277
278 if (m->pmatch != NULL)
279 free((char *)m->pmatch);
280 if (m->lastpos != NULL)
281 free((char *)m->lastpos);
282 STATETEARDOWN(m);
283 return(0);
284 }
285
286 /*
287 - dissect - figure out what matched what, no back references
 *
 * Walks the strip from startst to stopst one sub-RE at a time while sp
 * tracks the corresponding position in the text between start and stop.
 * Single-character sops just advance sp; OLPAREN/ORPAREN record
 * sp - m->offp in m->pmatch[]; for OQUEST_, OPLUS_ and OCH_ the length of
 * the piece is found by asking slow() for the longest sub-match whose
 * remainder still reaches `stop', after which the piece is dissected
 * recursively.
288 */
289 static char * /* == stop (success) always */
290 dissect(struct match *m, char *start, char *stop, sopno startst, sopno stopst)
291 {
292 int i;
293 sopno ss; /* start sop of current subRE */
294 sopno es; /* end sop of current subRE */
295 char *sp; /* start of string matched by it */
296 char *stp; /* string matched by it cannot pass here */
297 char *rest; /* start of rest of string */
298 char *tail; /* string unmatched by rest of RE */
299 sopno ssub; /* start sop of subsubRE */
300 sopno esub; /* end sop of subsubRE */
301 char *ssp; /* start of string matched by subsubRE */
302 char *sep; /* end of string matched by subsubRE */
303 char *oldssp; /* previous ssp */
304 char *dp;
305
306 AT("diss", start, stop, startst, stopst);
307 sp = start;
308 for (ss = startst; ss < stopst; ss = es) {
309 /* identify end of subRE */
310 es = ss;
311 switch (OP(m->g->strip[es])) {
312 case OPLUS_:
313 case OQUEST_:
/* the operand is the distance to the closing sop */
314 es += OPND(m->g->strip[es]);
315 break;
316 case OCH_:
/* hop from branch to branch until the closing O_CH */
317 while (OP(m->g->strip[es]) != O_CH)
318 es += OPND(m->g->strip[es]);
319 break;
320 }
321 es++;
322
323 /* figure out what it matched */
324 switch (OP(m->g->strip[ss])) {
325 case OEND:
326 assert(nope);
327 break;
328 case OCHAR:
329 sp++;
330 break;
331 case OBOL:
332 case OEOL:
333 case OBOW:
334 case OEOW:
/* anchors consume no text */
335 break;
336 case OANY:
337 case OANYOF:
338 sp++;
339 break;
340 case OBACK_:
341 case O_BACK:
/* back references are never handed to this routine */
342 assert(nope);
343 break;
344 /* cases where length of match is hard to find */
345 case OQUEST_:
346 stp = stop;
347 for (;;) {
348 /* how long could this one be? */
349 rest = slow(m, sp, stp, ss, es);
350 assert(rest != NULL); /* it did match */
351 /* could the rest match the rest? */
352 tail = slow(m, rest, stop, es, stopst);
353 if (tail == stop)
354 break; /* yes! */
355 /* no -- try a shorter match for this one */
356 stp = rest - 1;
357 assert(stp >= sp); /* it did work */
358 }
359 ssub = ss + 1;
360 esub = es - 1;
361 /* did innards match? */
362 if (slow(m, sp, rest, ssub, esub) != NULL) {
363 dp = dissect(m, sp, rest, ssub, esub);
364 assert(dp == rest);
365 } else /* no */
366 assert(sp == rest);
367 sp = rest;
368 break;
369 case OPLUS_:
370 stp = stop;
371 for (;;) {
372 /* how long could this one be? */
373 rest = slow(m, sp, stp, ss, es);
374 assert(rest != NULL); /* it did match */
375 /* could the rest match the rest? */
376 tail = slow(m, rest, stop, es, stopst);
377 if (tail == stop)
378 break; /* yes! */
379 /* no -- try a shorter match for this one */
380 stp = rest - 1;
381 assert(stp >= sp); /* it did work */
382 }
383 ssub = ss + 1;
384 esub = es - 1;
385 ssp = sp;
386 oldssp = ssp;
387 for (;;) { /* find last match of innards */
388 sep = slow(m, ssp, rest, ssub, esub);
389 if (sep == NULL || sep == ssp)
390 break; /* failed or matched null */
391 oldssp = ssp; /* on to next try */
392 ssp = sep;
393 }
394 if (sep == NULL) {
395 /* last successful match */
396 sep = ssp;
397 ssp = oldssp;
398 }
399 assert(sep == rest); /* must exhaust substring */
400 assert(slow(m, ssp, sep, ssub, esub) == rest);
/* only the final iteration's text is dissected */
401 dp = dissect(m, ssp, sep, ssub, esub);
402 assert(dp == sep);
403 sp = rest;
404 break;
405 case OCH_:
406 stp = stop;
407 for (;;) {
408 /* how long could this one be? */
409 rest = slow(m, sp, stp, ss, es);
410 assert(rest != NULL); /* it did match */
411 /* could the rest match the rest? */
412 tail = slow(m, rest, stop, es, stopst);
413 if (tail == stop)
414 break; /* yes! */
415 /* no -- try a shorter match for this one */
416 stp = rest - 1;
417 assert(stp >= sp); /* it did work */
418 }
419 ssub = ss + 1;
420 esub = ss + OPND(m->g->strip[ss]) - 1;
421 assert(OP(m->g->strip[esub]) == OOR1);
422 for (;;) { /* find first matching branch */
423 if (slow(m, sp, rest, ssub, esub) == rest)
424 break; /* it matched all of it */
425 /* that one missed, try next one */
426 assert(OP(m->g->strip[esub]) == OOR1);
427 esub++;
428 assert(OP(m->g->strip[esub]) == OOR2);
429 ssub = esub + 1;
430 esub += OPND(m->g->strip[esub]);
431 if (OP(m->g->strip[esub]) == OOR2)
432 esub--;
433 else
434 assert(OP(m->g->strip[esub]) == O_CH);
435 }
436 dp = dissect(m, sp, rest, ssub, esub);
437 assert(dp == rest);
438 sp = rest;
439 break;
440 case O_PLUS:
441 case O_QUEST:
442 case OOR1:
443 case OOR2:
444 case O_CH:
/* closing sops are stepped over via es, never visited */
445 assert(nope);
446 break;
447 case OLPAREN:
448 i = OPND(m->g->strip[ss]);
449 assert(0 < i && i <= m->g->nsub);
450 m->pmatch[i].rm_so = sp - m->offp;
451 break;
452 case ORPAREN:
453 i = OPND(m->g->strip[ss]);
454 assert(0 < i && i <= m->g->nsub);
455 m->pmatch[i].rm_eo = sp - m->offp;
456 break;
457 default: /* uh oh */
458 assert(nope);
459 break;
460 }
461 }
462
463 assert(sp == stop);
464 return(sp);
465 }
466
467 /*
468 - backref - figure out what matched what, figuring in back references
469 */
470 static char * /* == stop (success) or NULL (failure) */
471 backref(struct match *m, char *start, char *stop, sopno startst, sopno stopst,
472 sopno lev, int rec) /* PLUS nesting level */
473 {
474 int i;
475 sopno ss; /* start sop of current subRE */
476 char *sp; /* start of string matched by it */
477 sopno ssub; /* start sop of subsubRE */
478 sopno esub; /* end sop of subsubRE */
479 char *ssp; /* start of string matched by subsubRE */
480 char *dp;
481 size_t len;
482 int hard;
483 sop s;
484 llvm_regoff_t offsave;
485 cset *cs;
486
487 AT("back", start, stop, startst, stopst);
488 sp = start;
489
490 /* get as far as we can with easy stuff */
491 hard = 0;
492 for (ss = startst; !hard && ss < stopst; ss++)
493 switch (OP(s = m->g->strip[ss])) {
494 case OCHAR:
495 if (sp == stop || *sp++ != (char)OPND(s))
496 return(NULL);
497 break;
498 case OANY:
499 if (sp == stop)
500 return(NULL);
501 sp++;
502 break;
503 case OANYOF:
504 cs = &m->g->sets[OPND(s)];
505 if (sp == stop || !CHIN(cs, *sp++))
506 return(NULL);
507 break;
508 case OBOL:
509 if ( (sp == m->beginp && !(m->eflags®_NOTBOL)) ||
510 (sp < m->endp && *(sp-1) == '\n' &&
511 (m->g->cflags®_NEWLINE)) )
512 { /* yes */ }
513 else
514 return(NULL);
515 break;
516 case OEOL:
517 if ( (sp == m->endp && !(m->eflags®_NOTEOL)) ||
518 (sp < m->endp && *sp == '\n' &&
519 (m->g->cflags®_NEWLINE)) )
520 { /* yes */ }
521 else
522 return(NULL);
523 break;
524 case OBOW:
525 if (( (sp == m->beginp && !(m->eflags®_NOTBOL)) ||
526 (sp < m->endp && *(sp-1) == '\n' &&
527 (m->g->cflags®_NEWLINE)) ||
528 (sp > m->beginp &&
529 !ISWORD(*(sp-1))) ) &&
530 (sp < m->endp && ISWORD(*sp)) )
531 { /* yes */ }
532 else
533 return(NULL);
534 break;
535 case OEOW:
536 if (( (sp == m->endp && !(m->eflags®_NOTEOL)) ||
537 (sp < m->endp && *sp == '\n' &&
538 (m->g->cflags®_NEWLINE)) ||
539 (sp < m->endp && !ISWORD(*sp)) ) &&
540 (sp > m->beginp && ISWORD(*(sp-1))) )
541 { /* yes */ }
542 else
543 return(NULL);
544 break;
545 case O_QUEST:
546 break;
547 case OOR1: /* matches null but needs to skip */
548 ss++;
549 s = m->g->strip[ss];
550 do {
551 assert(OP(s) == OOR2);
552 ss += OPND(s);
553 } while (OP(s = m->g->strip[ss]) != O_CH);
554 /* note that the ss++ gets us past the O_CH */
555 break;
556 default: /* have to make a choice */
557 hard = 1;
558 break;
559 }
560 if (!hard) { /* that was it! */
561 if (sp != stop)
562 return(NULL);
563 return(sp);
564 }
565 ss--; /* adjust for the for's final increment */
566
567 /* the hard stuff */
568 AT("hard", sp, stop, ss, stopst);
569 s = m->g->strip[ss];
570 switch (OP(s)) {
571 case OBACK_: /* the vilest depths */
572 i = OPND(s);
573 assert(0 < i && i <= m->g->nsub);
574 if (m->pmatch[i].rm_eo == -1)
575 return(NULL);
576 assert(m->pmatch[i].rm_so != -1);
577 len = m->pmatch[i].rm_eo - m->pmatch[i].rm_so;
578 if (len == 0 && rec++ > MAX_RECURSION)
579 return(NULL);
580 assert(stop - m->beginp >= len);
581 if (sp > stop - len)
582 return(NULL); /* not enough left to match */
583 ssp = m->offp + m->pmatch[i].rm_so;
584 if (memcmp(sp, ssp, len) != 0)
585 return(NULL);
586 while (m->g->strip[ss] != SOP(O_BACK, i))
587 ss++;
588 return(backref(m, sp+len, stop, ss+1, stopst, lev, rec));
589 break;
590 case OQUEST_: /* to null or not */
591 dp = backref(m, sp, stop, ss+1, stopst, lev, rec);
592 if (dp != NULL)
593 return(dp); /* not */
594 return(backref(m, sp, stop, ss+OPND(s)+1, stopst, lev, rec));
595 break;
596 case OPLUS_:
597 assert(m->lastpos != NULL);
598 assert(lev+1 <= m->g->nplus);
599 m->lastpos[lev+1] = sp;
600 return(backref(m, sp, stop, ss+1, stopst, lev+1, rec));
601 break;
602 case O_PLUS:
603 if (sp == m->lastpos[lev]) /* last pass matched null */
604 return(backref(m, sp, stop, ss+1, stopst, lev-1, rec));
605 /* try another pass */
606 m->lastpos[lev] = sp;
607 dp = backref(m, sp, stop, ss-OPND(s)+1, stopst, lev, rec);
608 if (dp == NULL)
609 return(backref(m, sp, stop, ss+1, stopst, lev-1, rec));
610 else
611 return(dp);
612 break;
613 case OCH_: /* find the right one, if any */
614 ssub = ss + 1;
615 esub = ss + OPND(s) - 1;
616 assert(OP(m->g->strip[esub]) == OOR1);
617 for (;;) { /* find first matching branch */
618 dp = backref(m, sp, stop, ssub, esub, lev, rec);
619 if (dp != NULL)
620 return(dp);
621 /* that one missed, try next one */
622 if (OP(m->g->strip[esub]) == O_CH)
623 return(NULL); /* there is none */
624 esub++;
625 assert(OP(m->g->strip[esub]) == OOR2);
626 ssub = esub + 1;
627 esub += OPND(m->g->strip[esub]);
628 if (OP(m->g->strip[esub]) == OOR2)
629 esub--;
630 else
631 assert(OP(m->g->strip[esub]) == O_CH);
632 }
633 break;
634 case OLPAREN: /* must undo assignment if rest fails */
635 i = OPND(s);
636 assert(0 < i && i <= m->g->nsub);
637 offsave = m->pmatch[i].rm_so;
638 m->pmatch[i].rm_so = sp - m->offp;
639 dp = backref(m, sp, stop, ss+1, stopst, lev, rec);
640 if (dp != NULL)
641 return(dp);
642 m->pmatch[i].rm_so = offsave;
643 return(NULL);
644 break;
645 case ORPAREN: /* must undo assignment if rest fails */
646 i = OPND(s);
647 assert(0 < i && i <= m->g->nsub);
648 offsave = m->pmatch[i].rm_eo;
649 m->pmatch[i].rm_eo = sp - m->offp;
650 dp = backref(m, sp, stop, ss+1, stopst, lev, rec);
651 if (dp != NULL)
652 return(dp);
653 m->pmatch[i].rm_eo = offsave;
654 return(NULL);
655 break;
656 default: /* uh oh */
657 assert(nope);
658 break;
659 }
660
661 /* "can't happen" */
662 assert(nope);
663 /* NOTREACHED */
664 return NULL;
665 }
666
667 /*
668 - fast - step through the string at top speed
669 */
670 static char * /* where tentative match ended, or NULL */
671 fast(struct match *m, char *start, char *stop, sopno startst, sopno stopst)
672 {
673 states st = m->st;
674 states fresh = m->fresh;
675 states tmp = m->tmp;
676 char *p = start;
677 int c = (start == m->beginp) ? OUT : *(start-1);
678 int lastc; /* previous c */
679 int flagch;
680 int i;
681 char *coldp; /* last p after which no match was underway */
682
683 CLEAR(st);
684 SET1(st, startst);
685 st = step(m->g, startst, stopst, st, NOTHING, st);
686 ASSIGN(fresh, st);
687 SP("start", st, *p);
688 coldp = NULL;
689 for (;;) {
690 /* next character */
691 lastc = c;
692 c = (p == m->endp) ? OUT : *p;
693 if (EQ(st, fresh))
694 coldp = p;
695
696 /* is there an EOL and/or BOL between lastc and c? */
697 flagch = '\0';
698 i = 0;
699 if ( (lastc == '\n' && m->g->cflags®_NEWLINE) ||
700 (lastc == OUT && !(m->eflags®_NOTBOL)) ) {
701 flagch = BOL;
702 i = m->g->nbol;
703 }
704 if ( (c == '\n' && m->g->cflags®_NEWLINE) ||
705 (c == OUT && !(m->eflags®_NOTEOL)) ) {
706 flagch = (flagch == BOL) ? BOLEOL : EOL;
707 i += m->g->neol;
708 }
709 if (i != 0) {
710 for (; i > 0; i--)
711 st = step(m->g, startst, stopst, st, flagch, st);
712 SP("boleol", st, c);
713 }
714
715 /* how about a word boundary? */
716 if ( (flagch == BOL || (lastc != OUT && !ISWORD(lastc))) &&
717 (c != OUT && ISWORD(c)) ) {
718 flagch = BOW;
719 }
720 if ( (lastc != OUT && ISWORD(lastc)) &&
721 (flagch == EOL || (c != OUT && !ISWORD(c))) ) {
722 flagch = EOW;
723 }
724 if (flagch == BOW || flagch == EOW) {
725 st = step(m->g, startst, stopst, st, flagch, st);
726 SP("boweow", st, c);
727 }
728
729 /* are we done? */
730 if (ISSET(st, stopst) || p == stop)
731 break; /* NOTE BREAK OUT */
732
733 /* no, we must deal with this character */
734 ASSIGN(tmp, st);
735 ASSIGN(st, fresh);
736 assert(c != OUT);
737 st = step(m->g, startst, stopst, tmp, c, st);
738 SP("aft", st, c);
739 assert(EQ(step(m->g, startst, stopst, st, NOTHING, st), st));
740 p++;
741 }
742
743 assert(coldp != NULL);
744 m->coldp = coldp;
745 if (ISSET(st, stopst))
746 return(p+1);
747 else
748 return(NULL);
749 }
750
751 /*
752 - slow - step through the string more deliberately
753 */
754 static char * /* where it ended */
755 slow(struct match *m, char *start, char *stop, sopno startst, sopno stopst)
756 {
757 states st = m->st;
758 states empty = m->empty;
759 states tmp = m->tmp;
760 char *p = start;
761 int c = (start == m->beginp) ? OUT : *(start-1);
762 int lastc; /* previous c */
763 int flagch;
764 int i;
765 char *matchp; /* last p at which a match ended */
766
767 AT("slow", start, stop, startst, stopst);
768 CLEAR(st);
769 SET1(st, startst);
770 SP("sstart", st, *p);
771 st = step(m->g, startst, stopst, st, NOTHING, st);
772 matchp = NULL;
773 for (;;) {
774 /* next character */
775 lastc = c;
776 c = (p == m->endp) ? OUT : *p;
777
778 /* is there an EOL and/or BOL between lastc and c? */
779 flagch = '\0';
780 i = 0;
781 if ( (lastc == '\n' && m->g->cflags®_NEWLINE) ||
782 (lastc == OUT && !(m->eflags®_NOTBOL)) ) {
783 flagch = BOL;
784 i = m->g->nbol;
785 }
786 if ( (c == '\n' && m->g->cflags®_NEWLINE) ||
787 (c == OUT && !(m->eflags®_NOTEOL)) ) {
788 flagch = (flagch == BOL) ? BOLEOL : EOL;
789 i += m->g->neol;
790 }
791 if (i != 0) {
792 for (; i > 0; i--)
793 st = step(m->g, startst, stopst, st, flagch, st);
794 SP("sboleol", st, c);
795 }
796
797 /* how about a word boundary? */
798 if ( (flagch == BOL || (lastc != OUT && !ISWORD(lastc))) &&
799 (c != OUT && ISWORD(c)) ) {
800 flagch = BOW;
801 }
802 if ( (lastc != OUT && ISWORD(lastc)) &&
803 (flagch == EOL || (c != OUT && !ISWORD(c))) ) {
804 flagch = EOW;
805 }
806 if (flagch == BOW || flagch == EOW) {
807 st = step(m->g, startst, stopst, st, flagch, st);
808 SP("sboweow", st, c);
809 }
810
811 /* are we done? */
812 if (ISSET(st, stopst))
813 matchp = p;
814 if (EQ(st, empty) || p == stop)
815 break; /* NOTE BREAK OUT */
816
817 /* no, we must deal with this character */
818 ASSIGN(tmp, st);
819 ASSIGN(st, empty);
820 assert(c != OUT);
821 st = step(m->g, startst, stopst, tmp, c, st);
822 SP("saft", st, c);
823 assert(EQ(step(m->g, startst, stopst, st, NOTHING, st), st));
824 p++;
825 }
826
827 return(matchp);
828 }
829
830
831 /*
832 - step - map set of states reachable before char to set reachable after
833 */
834 static states
835 step(struct re_guts *g,
836 sopno start, /* start state within strip */
837 sopno stop, /* state after stop state within strip */
838 states bef, /* states reachable before */
839 int ch, /* character or NONCHAR code */
840 states aft) /* states already known reachable after */
841 {
842 cset *cs;
843 sop s;
844 sopno pc;
845 onestate here; /* note, macros know this name */
846 sopno look;
847 int i;
848
849 for (pc = start, INIT(here, pc); pc != stop; pc++, INC(here)) {
850 s = g->strip[pc];
851 switch (OP(s)) {
852 case OEND:
853 assert(pc == stop-1);
854 break;
855 case OCHAR:
856 /* only characters can match */
857 assert(!NONCHAR(ch) || ch != (char)OPND(s));
858 if (ch == (char)OPND(s))
859 FWD(aft, bef, 1);
860 break;
861 case OBOL:
862 if (ch == BOL || ch == BOLEOL)
863 FWD(aft, bef, 1);
864 break;
865 case OEOL:
866 if (ch == EOL || ch == BOLEOL)
867 FWD(aft, bef, 1);
868 break;
869 case OBOW:
870 if (ch == BOW)
871 FWD(aft, bef, 1);
872 break;
873 case OEOW:
874 if (ch == EOW)
875 FWD(aft, bef, 1);
876 break;
877 case OANY:
878 if (!NONCHAR(ch))
879 FWD(aft, bef, 1);
880 break;
881 case OANYOF:
882 cs = &g->sets[OPND(s)];
883 if (!NONCHAR(ch) && CHIN(cs, ch))
884 FWD(aft, bef, 1);
885 break;
886 case OBACK_: /* ignored here */
887 case O_BACK:
888 FWD(aft, aft, 1);