mirror of
https://github.com/php/php-src.git
synced 2026-03-29 19:52:20 +02:00
upgrade pcre to version 7.0
This commit is contained in:
1
NEWS
1
NEWS
@@ -2,6 +2,7 @@ PHP NEWS
|
||||
|||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||
|
||||
?? ??? 2007, PHP 5.2.2
|
||||
- Upgraded SQLite 3 to version 3.3.12 (Ilia)
|
||||
- Upgraded PCRE to version 7.0 (Nuno)
|
||||
- Added --ri switch to CLI, which allows checking extension information. (Marcus)
|
||||
- Fixed bug #40410 (ext/posix does not compile on MacOS 10.3.9). (Tony)
|
||||
- Fixed bug #39836 (SplObjectStorage empty after unserialize). (Marcus)
|
||||
|
||||
@@ -5,8 +5,8 @@ ARG_WITH("pcre-regex", "Perl Compatible Regular Expressions", "yes");
|
||||
|
||||
if (PHP_PCRE_REGEX == "yes") {
|
||||
EXTENSION("pcre", "php_pcre.c", PHP_PCRE_REGEX_SHARED,
|
||||
"-DEXPORT= -DNEWLINE=10 -DSUPPORT_UTF8 -DSUPPORT_UCP -DLINK_SIZE=2 -DPOSIX_MALLOC_THRESHOLD=10 -DMATCH_LIMIT=10000000 -DMATCH_LIMIT_RECURSION=10000000 -DMAX_NAME_SIZE=32 -DMAX_NAME_COUNT=10000 -DMAX_DUPLENGTH=30000 -DNO_RECURSE -Iext/pcre/pcrelib");
|
||||
ADD_SOURCES("ext/pcre/pcrelib", "pcre_chartables.c pcre_ucp_searchfuncs.c pcre_compile.c pcre_config.c pcre_exec.c pcre_fullinfo.c pcre_get.c pcre_globals.c pcre_info.c pcre_maketables.c pcre_ord2utf8.c pcre_refcount.c pcre_study.c pcre_tables.c pcre_try_flipped.c pcre_valid_utf8.c pcre_version.c pcre_xclass.c", "pcre");
|
||||
"-DEXPORT= -DNEWLINE=10 -DSUPPORT_UTF8 -DSUPPORT_UCP -DLINK_SIZE=2 -DPOSIX_MALLOC_THRESHOLD=10 -DMATCH_LIMIT=10000000 -DMATCH_LIMIT_RECURSION=10000000 -DMAX_NAME_SIZE=32 -DMAX_NAME_COUNT=10000 -DMAX_DUPLENGTH=30000 -DEBCDIC=0 -DNO_RECURSE -Iext/pcre/pcrelib");
|
||||
ADD_SOURCES("ext/pcre/pcrelib", "pcre_chartables.c pcre_ucp_searchfuncs.c pcre_compile.c pcre_config.c pcre_exec.c pcre_fullinfo.c pcre_get.c pcre_globals.c pcre_info.c pcre_maketables.c pcre_newline.c pcre_ord2utf8.c pcre_refcount.c pcre_study.c pcre_tables.c pcre_try_flipped.c pcre_valid_utf8.c pcre_version.c pcre_xclass.c", "pcre");
|
||||
ADD_DEF_FILE("ext\\pcre\\php_pcre.def");
|
||||
|
||||
AC_DEFINE('HAVE_BUNDLED_PCRE', 1, 'Using bundled PCRE library');
|
||||
|
||||
@@ -13,7 +13,7 @@ PHP_ARG_WITH(pcre-regex,for PCRE support,
|
||||
|
||||
if test "$PHP_PCRE_REGEX" != "no"; then
|
||||
if test "$PHP_PCRE_REGEX" = "yes"; then
|
||||
PHP_NEW_EXTENSION(pcre, pcrelib/pcre_chartables.c pcrelib/pcre_ucp_searchfuncs.c pcrelib/pcre_compile.c pcrelib/pcre_config.c pcrelib/pcre_exec.c pcrelib/pcre_fullinfo.c pcrelib/pcre_get.c pcrelib/pcre_globals.c pcrelib/pcre_info.c pcrelib/pcre_maketables.c pcrelib/pcre_ord2utf8.c pcrelib/pcre_refcount.c pcrelib/pcre_study.c pcrelib/pcre_tables.c pcrelib/pcre_try_flipped.c pcrelib/pcre_valid_utf8.c pcrelib/pcre_version.c pcrelib/pcre_xclass.c php_pcre.c, $ext_shared,,-DEXPORT= -DNEWLINE=10 -DSUPPORT_UTF8 -DSUPPORT_UCP -DLINK_SIZE=2 -DPOSIX_MALLOC_THRESHOLD=10 -DMATCH_LIMIT=10000000 -DMATCH_LIMIT_RECURSION=10000000 -DMAX_NAME_SIZE=32 -DMAX_NAME_COUNT=10000 -DMAX_DUPLENGTH=30000 -I@ext_srcdir@/pcrelib)
|
||||
PHP_NEW_EXTENSION(pcre, pcrelib/pcre_chartables.c pcrelib/pcre_ucp_searchfuncs.c pcrelib/pcre_compile.c pcrelib/pcre_config.c pcrelib/pcre_exec.c pcrelib/pcre_fullinfo.c pcrelib/pcre_get.c pcrelib/pcre_globals.c pcrelib/pcre_info.c pcrelib/pcre_maketables.c pcrelib/pcre_newline.c pcrelib/pcre_ord2utf8.c pcrelib/pcre_refcount.c pcrelib/pcre_study.c pcrelib/pcre_tables.c pcrelib/pcre_try_flipped.c pcrelib/pcre_valid_utf8.c pcrelib/pcre_version.c pcrelib/pcre_xclass.c php_pcre.c, $ext_shared,,-DEXPORT= -DNEWLINE=10 -DSUPPORT_UTF8 -DSUPPORT_UCP -DLINK_SIZE=2 -DPOSIX_MALLOC_THRESHOLD=10 -DMATCH_LIMIT=10000000 -DMATCH_LIMIT_RECURSION=10000000 -DMAX_NAME_SIZE=32 -DMAX_NAME_COUNT=10000 -DMAX_DUPLENGTH=30000 -DEBCDIC=0 -I@ext_srcdir@/pcrelib)
|
||||
PHP_ADD_BUILD_DIR($ext_builddir/pcrelib)
|
||||
PHP_INSTALL_HEADERS([ext/pcre], [php_pcre.h pcrelib/])
|
||||
AC_DEFINE(HAVE_BUNDLED_PCRE, 1, [ ])
|
||||
|
||||
@@ -4,7 +4,7 @@ PCRE LICENCE
|
||||
PCRE is a library of functions to support regular expressions whose syntax
|
||||
and semantics are as close as possible to those of the Perl 5 language.
|
||||
|
||||
Release 6 of PCRE is distributed under the terms of the "BSD" licence, as
|
||||
Release 7 of PCRE is distributed under the terms of the "BSD" licence, as
|
||||
specified below. The documentation for PCRE, supplied in the "doc"
|
||||
directory, is distributed under the same terms as the software itself.
|
||||
|
||||
|
||||
@@ -1,6 +1,279 @@
|
||||
ChangeLog for PCRE
|
||||
------------------
|
||||
|
||||
Version 7.0 19-Dec-06
|
||||
---------------------
|
||||
|
||||
1. Fixed a signed/unsigned compiler warning in pcre_compile.c, shown up by
|
||||
moving to gcc 4.1.1.
|
||||
|
||||
2. The -S option for pcretest uses setrlimit(); I had omitted to #include
|
||||
sys/time.h, which is documented as needed for this function. It doesn't
|
||||
seem to matter on Linux, but it showed up on some releases of OS X.
|
||||
|
||||
3. It seems that there are systems where bytes whose values are greater than
|
||||
127 match isprint() in the "C" locale. The "C" locale should be the
|
||||
default when a C program starts up. In most systems, only ASCII printing
|
||||
characters match isprint(). This difference caused the output from pcretest
|
||||
to vary, making some of the tests fail. I have changed pcretest so that:
|
||||
|
||||
(a) When it is outputting text in the compiled version of a pattern, bytes
|
||||
other than 32-126 are always shown as hex escapes.
|
||||
|
||||
(b) When it is outputting text that is a matched part of a subject string,
|
||||
it does the same, unless a different locale has been set for the match
|
||||
(using the /L modifier). In this case, it uses isprint() to decide.
|
||||
|
||||
4. Fixed a major bug that caused incorrect computation of the amount of memory
|
||||
required for a compiled pattern when options that changed within the
|
||||
pattern affected the logic of the preliminary scan that determines the
|
||||
length. The relevant options are -x, and -i in UTF-8 mode. The result was
|
||||
that the computed length was too small. The symptoms of this bug were
|
||||
either the PCRE error "internal error: code overflow" from pcre_compile(),
|
||||
or a glibc crash with a message such as "pcretest: free(): invalid next
|
||||
size (fast)". Examples of patterns that provoked this bug (shown in
|
||||
pcretest format) are:
|
||||
|
||||
/(?-x: )/x
|
||||
/(?x)(?-x: \s*#\s*)/
|
||||
/((?i)[\x{c0}])/8
|
||||
/(?i:[\x{c0}])/8
|
||||
|
||||
HOWEVER: Change 17 below makes this fix obsolete as the memory computation
|
||||
is now done differently.
|
||||
|
||||
5. Applied patches from Google to: (a) add a QuoteMeta function to the C++
|
||||
wrapper classes; (b) implement a new function in the C++ scanner that is
|
||||
more efficient than the old way of doing things because it avoids levels of
|
||||
recursion in the regex matching; (c) add a paragraph to the documentation
|
||||
for the FullMatch() function.
|
||||
|
||||
6. The escape sequence \n was being treated as whatever was defined as
|
||||
"newline". Not only was this contrary to the documentation, which states
|
||||
that \n is character 10 (hex 0A), but it also went horribly wrong when
|
||||
"newline" was defined as CRLF. This has been fixed.
|
||||
|
||||
7. In pcre_dfa_exec.c the value of an unsigned integer (the variable called c)
|
||||
was being set to -1 for the "end of line" case (supposedly a value that no
|
||||
character can have). Though this value is never used (the check for end of
|
||||
line is "zero bytes in current character"), it caused compiler complaints.
|
||||
I've changed it to 0xffffffff.
|
||||
|
||||
8. In pcre_version.c, the version string was being built by a sequence of
|
||||
C macros that, in the event of PCRE_PRERELEASE being defined as an empty
|
||||
string (as it is for production releases) called a macro with an empty
|
||||
argument. The C standard says the result of this is undefined. The gcc
|
||||
compiler treats it as an empty string (which was what was wanted) but it is
|
||||
reported that Visual C gives an error. The source has been hacked around to
|
||||
avoid this problem.
|
||||
|
||||
9. On the advice of a Windows user, included <io.h> and <fcntl.h> in Windows
|
||||
builds of pcretest, and changed the call to _setmode() to use _O_BINARY
|
||||
instead of 0x8000. Made all the #ifdefs test both _WIN32 and WIN32 (not all
|
||||
of them did).
|
||||
|
||||
10. Originally, pcretest opened its input and output without "b"; then I was
|
||||
told that "b" was needed in some environments, so it was added for release
|
||||
5.0 to both the input and output. (It makes no difference on Unix-like
|
||||
systems.) Later I was told that it is wrong for the input on Windows. I've
|
||||
now abstracted the modes into two macros, to make it easier to fiddle with
|
||||
them, and removed "b" from the input mode under Windows.
|
||||
|
||||
11. Added pkgconfig support for the C++ wrapper library, libpcrecpp.
|
||||
|
||||
12. Added -help and --help to pcretest as an official way of being reminded
|
||||
of the options.
|
||||
|
||||
13. Removed some redundant semicolons after macro calls in pcrecpparg.h.in
|
||||
and pcrecpp.cc because they annoy compilers at high warning levels.
|
||||
|
||||
14. A bit of tidying/refactoring in pcre_exec.c in the main bumpalong loop.
|
||||
|
||||
15. Fixed an occurrence of == in configure.ac that should have been = (shell
|
||||
scripts are not C programs :-) and which was not noticed because it works
|
||||
on Linux.
|
||||
|
||||
16. pcretest is supposed to handle any length of pattern and data line (as one
|
||||
line or as a continued sequence of lines) by extending its input buffer if
|
||||
necessary. This feature was broken for very long pattern lines, leading to
|
||||
a string of junk being passed to pcre_compile() if the pattern was longer
|
||||
than about 50K.
|
||||
|
||||
17. I have done a major re-factoring of the way pcre_compile() computes the
|
||||
amount of memory needed for a compiled pattern. Previously, there was code
|
||||
that made a preliminary scan of the pattern in order to do this. That was
|
||||
OK when PCRE was new, but as the facilities have expanded, it has become
|
||||
harder and harder to keep it in step with the real compile phase, and there
|
||||
have been a number of bugs (see for example, 4 above). I have now found a
|
||||
cunning way of running the real compile function in a "fake" mode that
|
||||
enables it to compute how much memory it would need, while actually only
|
||||
ever using a few hundred bytes of working memory and without too many
|
||||
tests of the mode. This should make future maintenance and development
|
||||
easier. A side effect of this work is that the limit of 200 on the nesting
|
||||
depth of parentheses has been removed (though this was never a serious
|
||||
limitation, I suspect). However, there is a downside: pcre_compile() now
|
||||
runs more slowly than before (30% or more, depending on the pattern). I
|
||||
hope this isn't a big issue. There is no effect on runtime performance.
|
||||
|
||||
18. Fixed a minor bug in pcretest: if a pattern line was not terminated by a
|
||||
newline (only possible for the last line of a file) and it was a
|
||||
pattern that set a locale (followed by /Lsomething), pcretest crashed.
|
||||
|
||||
19. Added additional timing features to pcretest. (1) The -tm option now times
|
||||
matching only, not compiling. (2) Both -t and -tm can be followed, as a
|
||||
separate command line item, by a number that specifies the number of
|
||||
repeats to use when timing. The default is 50000; this gives better
|
||||
precision, but takes uncomfortably long for very large patterns.
|
||||
|
||||
20. Extended pcre_study() to be more clever in cases where a branch of a
|
||||
subpattern has no definite first character. For example, (a*|b*)[cd] would
|
||||
previously give no result from pcre_study(). Now it recognizes that the
|
||||
first character must be a, b, c, or d.
|
||||
|
||||
21. There was an incorrect error "recursive call could loop indefinitely" if
|
||||
a subpattern (or the entire pattern) that was being tested for matching an
|
||||
empty string contained only one non-empty item after a nested subpattern.
|
||||
For example, the pattern (?>\x{100}*)\d(?R) provoked this error
|
||||
incorrectly, because the \d was being skipped in the check.
|
||||
|
||||
22. The pcretest program now has a new pattern option /B and a command line
|
||||
option -b, which is equivalent to adding /B to every pattern. This causes
|
||||
it to show the compiled bytecode, without the additional information that
|
||||
-d shows. The effect of -d is now the same as -b with -i (and similarly, /D
|
||||
is the same as /B/I).
|
||||
|
||||
23. A new optimization is now able automatically to treat some sequences such
|
||||
as a*b as a*+b. More specifically, if something simple (such as a character
|
||||
or a simple class like \d) has an unlimited quantifier, and is followed by
|
||||
something that cannot possibly match the quantified thing, the quantifier
|
||||
is automatically "possessified".
|
||||
|
||||
24. A recursive reference to a subpattern whose number was greater than 39
|
||||
went wrong under certain circumstances in UTF-8 mode. This bug could also
|
||||
have affected the operation of pcre_study().
|
||||
|
||||
25. Realized that a little bit of performance could be had by replacing
|
||||
(c & 0xc0) == 0xc0 with c >= 0xc0 when processing UTF-8 characters.
|
||||
|
||||
26. Timing data from pcretest is now shown to 4 decimal places instead of 3.
|
||||
|
||||
27. Possessive quantifiers such as a++ were previously implemented by turning
|
||||
them into atomic groups such as (?>a+). Now they have their own opcodes,
|
||||
which improves performance. This includes the automatically created ones
|
||||
from 23 above.
|
||||
|
||||
28. A pattern such as (?=(\w+))\1: which simulates an atomic group using a
|
||||
lookahead was broken if it was not anchored. PCRE was mistakenly expecting
|
||||
the first matched character to be a colon. This applied both to named and
|
||||
numbered groups.
|
||||
|
||||
29. The ucpinternal.h header file was missing its idempotency #ifdef.
|
||||
|
||||
30. I was sent a "project" file called libpcre.a.dev which I understand makes
|
||||
building PCRE on Windows easier, so I have included it in the distribution.
|
||||
|
||||
31. There is now a check in pcretest against a ridiculously large number being
|
||||
returned by pcre_exec() or pcre_dfa_exec(). If this happens in a /g or /G
|
||||
loop, the loop is abandoned.
|
||||
|
||||
32. Forward references to subpatterns in conditions such as (?(2)...) where
|
||||
subpattern 2 is defined later cause pcre_compile() to search forwards in
|
||||
the pattern for the relevant set of parentheses. This search went wrong
|
||||
when there were unescaped parentheses in a character class, parentheses
|
||||
escaped with \Q...\E, or parentheses in a #-comment in /x mode.
|
||||
|
||||
33. "Subroutine" calls and backreferences were previously restricted to
|
||||
referencing subpatterns earlier in the regex. This restriction has now
|
||||
been removed.
|
||||
|
||||
34. Added a number of extra features that are going to be in Perl 5.10. On the
|
||||
whole, these are just syntactic alternatives for features that PCRE had
|
||||
previously implemented using the Python syntax or my own invention. The
|
||||
other formats are all retained for compatibility.
|
||||
|
||||
(a) Named groups can now be defined as (?<name>...) or (?'name'...) as well
|
||||
as (?P<name>...). The new forms, as well as being in Perl 5.10, are
|
||||
also .NET compatible.
|
||||
|
||||
(b) A recursion or subroutine call to a named group can now be defined as
|
||||
(?&name) as well as (?P>name).
|
||||
|
||||
(c) A backreference to a named group can now be defined as \k<name> or
|
||||
\k'name' as well as (?P=name). The new forms, as well as being in Perl
|
||||
5.10, are also .NET compatible.
|
||||
|
||||
(d) A conditional reference to a named group can now use the syntax
|
||||
(?(<name>) or (?('name') as well as (?(name).
|
||||
|
||||
(e) A "conditional group" of the form (?(DEFINE)...) can be used to define
|
||||
groups (named and numbered) that are never evaluated inline, but can be
|
||||
called as "subroutines" from elsewhere. In effect, the DEFINE condition
|
||||
is always false. There may be only one alternative in such a group.
|
||||
|
||||
(f) A test for recursion can be given as (?(R1)... or (?(R&name)... as well
|
||||
as the simple (?(R). The condition is true only if the most recent
|
||||
recursion is that of the given number or name. It does not search out
|
||||
through the entire recursion stack.
|
||||
|
||||
(g) The escape \gN or \g{N} has been added, where N is a positive or
|
||||
negative number, specifying an absolute or relative reference.
|
||||
|
||||
35. Tidied to get rid of some further signed/unsigned compiler warnings and
|
||||
some "unreachable code" warnings.
|
||||
|
||||
36. Updated the Unicode property tables to Unicode version 5.0.0. Amongst other
|
||||
things, this adds five new scripts.
|
||||
|
||||
37. Perl ignores orphaned \E escapes completely. PCRE now does the same.
|
||||
There were also incompatibilities regarding the handling of \Q..\E inside
|
||||
character classes, for example with patterns like [\Qa\E-\Qz\E] where the
|
||||
hyphen was adjacent to \Q or \E. I hope I've cleared all this up now.
|
||||
|
||||
38. Like Perl, PCRE detects when an indefinitely repeated parenthesized group
|
||||
matches an empty string, and forcibly breaks the loop. There were bugs in
|
||||
this code in non-simple cases. For a pattern such as ^(a()*)* matched
|
||||
against aaaa the result was just "a" rather than "aaaa", for example. Two
|
||||
separate and independent bugs (that affected different cases) have been
|
||||
fixed.
|
||||
|
||||
39. Refactored the code to abolish the use of different opcodes for small
|
||||
capturing bracket numbers. This is a tidy that I avoided doing when I
|
||||
removed the limit on the number of capturing brackets for 3.5 back in 2001.
|
||||
The new approach is not only tidier, it makes it possible to reduce the
|
||||
memory needed to fix the previous bug (38).
|
||||
|
||||
40. Implemented PCRE_NEWLINE_ANY to recognize any of the Unicode newline
|
||||
sequences (http://unicode.org/unicode/reports/tr18/) as "newline" when
|
||||
processing dot, circumflex, or dollar metacharacters, or #-comments in /x
|
||||
mode.
|
||||
|
||||
41. Add \R to match any Unicode newline sequence, as suggested in the Unicode
|
||||
report.
|
||||
|
||||
42. Applied patch, originally from Ari Pollak, modified by Google, to allow
|
||||
copy construction and assignment in the C++ wrapper.
|
||||
|
||||
43. Updated pcregrep to support "--newline=any". In the process, I fixed a
|
||||
couple of bugs that could have given wrong results in the "--newline=crlf"
|
||||
case.
|
||||
|
||||
44. Added a number of casts and did some reorganization of signed/unsigned int
|
||||
variables following suggestions from Dair Grant. Also renamed the variable
|
||||
"this" as "item" because it is a C++ keyword.
|
||||
|
||||
45. Arranged for dftables to add
|
||||
|
||||
#include "pcre_internal.h"
|
||||
|
||||
to pcre_chartables.c because without it, gcc 4.x may remove the array
|
||||
definition from the final binary if PCRE is built into a static library and
|
||||
dead code stripping is activated.
|
||||
|
||||
46. For an unanchored pattern, if a match attempt fails at the start of a
|
||||
newline sequence, and the newline setting is CRLF or ANY, and the next two
|
||||
characters are CRLF, advance by two characters instead of one.
|
||||
|
||||
|
||||
Version 6.7 04-Jul-06
|
||||
---------------------
|
||||
|
||||
|
||||
@@ -4,7 +4,7 @@ PCRE LICENCE
|
||||
PCRE is a library of functions to support regular expressions whose syntax
|
||||
and semantics are as close as possible to those of the Perl 5 language.
|
||||
|
||||
Release 6 of PCRE is distributed under the terms of the "BSD" licence, as
|
||||
Release 7 of PCRE is distributed under the terms of the "BSD" licence, as
|
||||
specified below. The documentation for PCRE, supplied in the "doc"
|
||||
directory, is distributed under the same terms as the software itself.
|
||||
|
||||
|
||||
@@ -1,6 +1,36 @@
|
||||
News about PCRE releases
|
||||
------------------------
|
||||
|
||||
Release 7.0 23-Nov-06
|
||||
---------------------
|
||||
|
||||
This release has a new major number because there have been some internal
|
||||
upheavals to facilitate the addition of new optimizations and other facilities,
|
||||
and to make subsequent maintenance and extension easier. Compilation is likely
|
||||
to be a bit slower, but there should be no major effect on runtime performance.
|
||||
Previously compiled patterns are NOT upwards compatible with this release. If
|
||||
you have saved compiled patterns from a previous release, you will have to
|
||||
re-compile them. Important changes that are visible to users are:
|
||||
|
||||
1. The Unicode property tables have been updated to Unicode 5.0.0, which adds
|
||||
some more scripts.
|
||||
|
||||
2. The option PCRE_NEWLINE_ANY causes PCRE to recognize any Unicode newline
|
||||
sequence as a newline.
|
||||
|
||||
3. The \R escape matches a single Unicode newline sequence as a single unit.
|
||||
|
||||
4. New features that will appear in Perl 5.10 are now in PCRE. These include
|
||||
alternative Perl syntax for named parentheses, and Perl syntax for
|
||||
recursion.
|
||||
|
||||
5. The C++ wrapper interface has been extended by the addition of a
|
||||
QuoteMeta function and the ability to allow copy construction and
|
||||
assignment.
|
||||
|
||||
For a complete list of changes, see the ChangeLog file.
|
||||
|
||||
|
||||
Release 6.7 04-Jul-06
|
||||
---------------------
|
||||
|
||||
|
||||
@@ -22,7 +22,7 @@ The following are generic comments about building PCRE. The interspersed
|
||||
indented commands are suggestions from Mark Tetrode as to which commands you
|
||||
might use on a Windows system to build a static library.
|
||||
|
||||
(1) Copy or rename the file config.in as config.h, and change the macros that
|
||||
(1) Copy or rename the file config.h.in as config.h, and change the macros that
|
||||
define HAVE_STRERROR and HAVE_MEMMOVE to define them as 1 rather than 0.
|
||||
Unfortunately, because of the way Unix autoconf works, the default setting has
|
||||
to be 0. You may also want to make changes to other macros in config.h. In
|
||||
@@ -31,7 +31,7 @@ the NEWLINE macro. The default is to use '\n', thereby using whatever value
|
||||
your compiler gives to '\n'.
|
||||
|
||||
rem Mark Tetrode's commands
|
||||
copy config.in config.h
|
||||
copy config.h.in config.h
|
||||
rem Use write, because notepad cannot handle UNIX files. Change values.
|
||||
write config.h
|
||||
|
||||
@@ -56,6 +56,7 @@ character tables and writes them to that file.
|
||||
pcre_globals.c
|
||||
pcre_info.c
|
||||
pcre_maketables.c
|
||||
pcre_newline.c
|
||||
pcre_ord2utf8.c
|
||||
pcre_refcount.c
|
||||
pcre_study.c
|
||||
@@ -93,10 +94,10 @@ pcre and pcreposix libraries when linking.
|
||||
cl /F0x400000 pcretest.c pcre.lib pcreposix.lib
|
||||
|
||||
(6) Run pcretest on the testinput files in the testdata directory, and check
|
||||
that the output matches the corresponding testoutput files. You must use the
|
||||
-i option when checking testinput2. Note that the supplied files are in Unix
|
||||
format, with just LF characters as line terminators. You may need to edit them
|
||||
to change this if your system uses a different convention.
|
||||
that the output matches the corresponding testoutput files. Note that the
|
||||
supplied files are in Unix format, with just LF characters as line terminators.
|
||||
You may need to edit them to change this if your system uses a different
|
||||
convention.
|
||||
|
||||
rem Mark Tetrode's commands
|
||||
pcretest testdata\testinput1 testdata\myoutput1
|
||||
@@ -135,6 +136,17 @@ If you have a system without "configure" but where you can use a Makefile, edit
|
||||
Makefile.in to create Makefile, substituting suitable values for the variables
|
||||
at the head of the file.
|
||||
|
||||
Michael Roy sent these comments about building PCRE under Windows with BCC5.5:
|
||||
|
||||
Some of the core BCC libraries have a version of PCRE from 1998 built in,
|
||||
which can lead to pcre_exec() giving an erroneous PCRE_ERROR_NULL from a
|
||||
version mismatch. I'm including an easy workaround below, if you'd like to
|
||||
include it in the non-unix instructions:
|
||||
|
||||
When linking a project with BCC5.5, pcre.lib must be included before any of
|
||||
the libraries cw32.lib, cw32i.lib, cw32mt.lib, and cw32mti.lib on the command
|
||||
line.
|
||||
|
||||
Some help in building a Win32 DLL of PCRE in GnuWin32 environments was
|
||||
contributed by Paul Sokolovsky. These environments are Mingw32
|
||||
(http://www.xraylith.wisc.edu/~khan/software/gnu-win32/) and CygWin
|
||||
|
||||
@@ -118,13 +118,13 @@ library. You can read more about them in the pcrebuild man page.
|
||||
property table); only the basic two-letter properties such as Lu are
|
||||
supported.
|
||||
|
||||
. You can build PCRE to recognize either CR or LF or the sequence CRLF as
|
||||
indicating the end of a line. Whatever you specify at build time is the
|
||||
default; the caller of PCRE can change the selection at run time. The default
|
||||
newline indicator is a single LF character (the Unix standard). You can
|
||||
specify the default newline indicator by adding --newline-is-cr or
|
||||
--newline-is-lf or --newline-is-crlf to the "configure" command,
|
||||
respectively.
|
||||
. You can build PCRE to recognize either CR or LF or the sequence CRLF or any
|
||||
of the Unicode newline sequences as indicating the end of a line. Whatever
|
||||
you specify at build time is the default; the caller of PCRE can change the
|
||||
selection at run time. The default newline indicator is a single LF character
|
||||
(the Unix standard). You can specify the default newline indicator by adding
|
||||
--newline-is-cr or --newline-is-lf or --newline-is-crlf or --newline-is-any
|
||||
to the "configure" command, respectively.
|
||||
|
||||
. When called via the POSIX interface, PCRE uses malloc() to get additional
|
||||
storage for processing capturing parentheses if there are more than 10 of
|
||||
@@ -283,7 +283,7 @@ to the values of CC and CFLAGS.
|
||||
Using HP's ANSI C++ compiler (aCC)
|
||||
----------------------------------
|
||||
|
||||
Unless C++ support is disabled by specifiying the "--disable-cpp" option of the
|
||||
Unless C++ support is disabled by specifying the "--disable-cpp" option of the
|
||||
"configure" script, you *must* include the "-AA" option in the CXXFLAGS
|
||||
environment variable in order for the C++ components to compile correctly.
|
||||
|
||||
@@ -305,8 +305,8 @@ PCRE in the same way as for Unix systems.
|
||||
|
||||
PCRE has been compiled on Windows systems and on Macintoshes, but I don't know
|
||||
the details because I don't use those systems. It should be straightforward to
|
||||
build PCRE on any system that has a Standard C compiler, because it uses only
|
||||
Standard C functions.
|
||||
build PCRE on any system that has a Standard C compiler and library, because it
|
||||
uses only Standard C functions.
|
||||
|
||||
|
||||
Testing PCRE
|
||||
@@ -325,15 +325,15 @@ NON-UNIX-USE.
|
||||
The RunTest script runs the pcretest test program (which is documented in its
|
||||
own man page) on each of the testinput files (in the testdata directory) in
|
||||
turn, and compares the output with the contents of the corresponding testoutput
|
||||
file. A file called testtry is used to hold the main output from pcretest
|
||||
files. A file called testtry is used to hold the main output from pcretest
|
||||
(testsavedregex is also used as a working file). To run pcretest on just one of
|
||||
the test files, give its number as an argument to RunTest, for example:
|
||||
|
||||
RunTest 2
|
||||
|
||||
The first file can also be fed directly into the perltest script to check that
|
||||
Perl gives the same results. The only difference you should see is in the first
|
||||
few lines, where the Perl version is given instead of the PCRE version.
|
||||
The first test file can also be fed directly into the perltest script to check
|
||||
that Perl gives the same results. The only difference you should see is in the
|
||||
first few lines, where the Perl version is given instead of the PCRE version.
|
||||
|
||||
The second set of tests checks pcre_fullinfo(), pcre_info(), pcre_study(),
|
||||
pcre_copy_substring(), pcre_get_substring(), pcre_get_substring_list(), error
|
||||
@@ -442,6 +442,7 @@ The distribution should contain the following files:
|
||||
pcre_globals.c ) and some internal functions that they use
|
||||
pcre_info.c )
|
||||
pcre_maketables.c )
|
||||
pcre_newline.c )
|
||||
pcre_ord2utf8.c )
|
||||
pcre_refcount.c )
|
||||
pcre_study.c )
|
||||
@@ -525,4 +526,4 @@ The distribution should contain the following files:
|
||||
Philip Hazel
|
||||
Email local part: ph10
|
||||
Email domain: cam.ac.uk
|
||||
June 2006
|
||||
November 2006
|
||||
|
||||
@@ -6,7 +6,7 @@
|
||||
and semantics are as close as possible to those of the Perl 5 language.
|
||||
|
||||
Written by Philip Hazel
|
||||
Copyright (c) 1997-2007 University of Cambridge
|
||||
Copyright (c) 1997-2006 University of Cambridge
|
||||
|
||||
-----------------------------------------------------------------------------
|
||||
Redistribution and use in source and binary forms, with or without
|
||||
@@ -86,7 +86,16 @@ fprintf(f,
|
||||
fprintf(f,
|
||||
"This file contains the default tables for characters with codes less than\n"
|
||||
"128 (ASCII characters). These tables are used when no external tables are\n"
|
||||
"passed to PCRE. */\n\n"
|
||||
"passed to PCRE.\n\n");
|
||||
fprintf(f,
|
||||
"The following #include is present because without it gcc 4.x may remove\n"
|
||||
"the array definition from the final binary if PCRE is built into a static\n"
|
||||
"library and dead code stripping is activated. This leads to link errors.\n"
|
||||
"Pulling in the header ensures that the array gets flagged as \"someone\n"
|
||||
"outside this compilation unit might reference this\" and so it will always\n"
|
||||
"be supplied to the linker. */\n\n"
|
||||
"#include \"pcre_internal.h\"\n\n");
|
||||
fprintf(f,
|
||||
"const unsigned char _pcre_default_tables[] = {\n\n"
|
||||
"/* This table is a lower casing table. */\n\n");
|
||||
|
||||
|
||||
@@ -16,10 +16,11 @@ not operate by backtracking, as the original Henry Spencer code and current
|
||||
Perl code do, but instead checked all possibilities simultaneously by keeping
|
||||
a list of current states and checking all of them as it advanced through the
|
||||
subject string. In the terminology of Jeffrey Friedl's book, it was a "DFA
|
||||
algorithm". When the pattern was all used up, all remaining states were
|
||||
possible matches, and the one matching the longest subset of the subject string
|
||||
was chosen. This did not necessarily maximize the individual wild portions of
|
||||
the pattern, as is expected in Unix and Perl-style regular expressions.
|
||||
algorithm", though it was not a traditional Finite State Machine (FSM). When
|
||||
the pattern was all used up, all remaining states were possible matches, and
|
||||
the one matching the longest subset of the subject string was chosen. This did
|
||||
not necessarily maximize the individual wild portions of the pattern, as is
|
||||
expected in Unix and Perl-style regular expressions.
|
||||
|
||||
Historical note 2
|
||||
-----------------
|
||||
@@ -41,14 +42,38 @@ unrelated to those mentioned above), I tried at first to invent an algorithm
|
||||
that used an amount of store bounded by a multiple of the number of characters
|
||||
in the pattern, to save on compiling time. However, because of the greater
|
||||
complexity in Perl regular expressions, I couldn't do this. In any case, a
|
||||
first pass through the pattern is needed, for a number of reasons. PCRE works
|
||||
by running a very degenerate first pass to calculate a maximum store size, and
|
||||
then a second pass to do the real compile - which may use a bit less than the
|
||||
predicted amount of store. The idea is that this is going to turn out faster
|
||||
because the first pass is degenerate and the second pass can just store stuff
|
||||
straight into the vector, which it knows is big enough. It does make the
|
||||
compiling functions bigger, of course, but they have become quite big anyway to
|
||||
handle all the Perl stuff.
|
||||
first pass through the pattern is helpful for other reasons.
|
||||
|
||||
Computing the memory requirement: how it was
|
||||
--------------------------------------------
|
||||
|
||||
Up to and including release 6.7, PCRE worked by running a very degenerate first
|
||||
pass to calculate a maximum store size, and then a second pass to do the real
|
||||
compile - which might use a bit less than the predicted amount of memory. The
|
||||
idea was that this would turn out faster than the Henry Spencer code because
|
||||
the first pass is degenerate and the second pass can just store stuff straight
|
||||
into the vector, which it knows is big enough.
|
||||
|
||||
Computing the memory requirement: how it is
|
||||
-------------------------------------------
|
||||
|
||||
By the time I was working on a potential 6.8 release, the degenerate first pass
|
||||
had become very complicated and hard to maintain. Indeed one of the early
|
||||
things I did for 6.8 was to fix Yet Another Bug in the memory computation. Then
|
||||
I had a flash of inspiration as to how I could run the real compile function in
|
||||
a "fake" mode that enables it to compute how much memory it would need, while
|
||||
actually only ever using a few hundred bytes of working memory, and without too
|
||||
many tests of the mode that might slow it down. So I re-factored the compiling
|
||||
functions to work this way. This got rid of about 600 lines of source. It
|
||||
should make future maintenance and development easier. As this was such a major
|
||||
change, I never released 6.8, instead upping the number to 7.0 (other quite
|
||||
major changes are also present in the 7.0 release).
|
||||
|
||||
A side effect of this work is that the previous limit of 200 on the nesting
|
||||
depth of parentheses was removed. However, there is a downside: pcre_compile()
|
||||
runs more slowly than before (30% or more, depending on the pattern) because it
|
||||
is doing a full analysis of the pattern. My hope is that this is not a big
|
||||
issue.
|
||||
|
||||
Traditional matching function
|
||||
-----------------------------
|
||||
@@ -70,6 +95,12 @@ interprets the same compiled pattern data as pcre_exec(); however, not all the
|
||||
facilities are available, and those that are do not always work in quite the
|
||||
same way. See the user documentation for details.
|
||||
|
||||
The algorithm that is used for pcre_dfa_exec() is not a traditional FSM,
|
||||
because it may have a number of states active at one time. More work would be
|
||||
needed at compile time to produce a traditional FSM where only one state is
|
||||
ever active at once. I believe some other regex matchers work this way.
|
||||
|
||||
|
||||
Format of compiled patterns
|
||||
---------------------------
|
||||
|
||||
@@ -79,10 +110,12 @@ item is either implicit in the opcode or contained in the data bytes that
|
||||
follow it.
|
||||
|
||||
In many cases below "two-byte" data values are specified. This is in fact just
|
||||
a default. PCRE can be compiled to use 3-byte or 4-byte values (impairing the
|
||||
a default when the number is an offset within the compiled pattern. PCRE can be
|
||||
compiled to use 3-byte or 4-byte values for these offsets (impairing the
|
||||
performance). This is necessary only when patterns whose compiled length is
|
||||
greater than 64K are going to be processed. In this description, we assume the
|
||||
"normal" compilation options.
|
||||
greater than 64K are going to be processed. In this description, we assume the
|
||||
"normal" compilation options. "Two-byte" data values that are counts (e.g. for
|
||||
quantifiers) are always just two bytes.
|
||||
|
||||
A list of all the opcodes follows:
|
||||
|
||||
@@ -109,6 +142,7 @@ These items are all just one byte long
|
||||
OP_EOD match end of data: \z
|
||||
OP_DOLL $ (end of data, or before \n in multiline)
|
||||
OP_EXTUNI match an extended Unicode character
|
||||
OP_ANYNL match any Unicode newline sequence
|
||||
|
||||
|
||||
Repeating single characters
|
||||
@@ -119,23 +153,28 @@ following opcodes:
|
||||
|
||||
OP_STAR
|
||||
OP_MINSTAR
|
||||
OP_POSSTAR
|
||||
OP_PLUS
|
||||
OP_MINPLUS
|
||||
OP_POSPLUS
|
||||
OP_QUERY
|
||||
OP_MINQUERY
|
||||
OP_POSQUERY
|
||||
|
||||
In ASCII mode, these are two-byte items; in UTF-8 mode, the length is variable.
|
||||
Those with "MIN" in their name are the minimizing versions. Each is followed by
|
||||
the character that is to be repeated. Other repeats make use of
|
||||
Those with "MIN" in their name are the minimizing versions. Those with "POS" in
|
||||
their names are possessive versions. Each is followed by the character that is
|
||||
to be repeated. Other repeats make use of
|
||||
|
||||
OP_UPTO
|
||||
OP_MINUPTO
|
||||
OP_POSUPTO
|
||||
OP_EXACT
|
||||
|
||||
which are followed by a two-byte count (most significant first) and the
|
||||
repeated character. OP_UPTO matches from 0 to the given number. A repeat with a
|
||||
non-zero minimum and a fixed maximum is coded as an OP_EXACT followed by an
|
||||
OP_UPTO (or OP_MINUPTO).
|
||||
OP_UPTO (or OP_MINUPTO or OP_POSUPTO).
|
||||
|
||||
|
||||
Repeating character types
|
||||
@@ -147,12 +186,16 @@ byte. The opcodes are:
|
||||
|
||||
OP_TYPESTAR
|
||||
OP_TYPEMINSTAR
|
||||
OP_TYPEPOSSTAR
|
||||
OP_TYPEPLUS
|
||||
OP_TYPEMINPLUS
|
||||
OP_TYPEPOSPLUS
|
||||
OP_TYPEQUERY
|
||||
OP_TYPEMINQUERY
|
||||
OP_TYPEPOSQUERY
|
||||
OP_TYPEUPTO
|
||||
OP_TYPEMINUPTO
|
||||
OP_TYPEPOSUPTO
|
||||
OP_TYPEEXACT
|
||||
|
||||
|
||||
@@ -216,9 +259,10 @@ OP_REF is followed by two bytes containing the reference number.
|
||||
Repeating character classes and back references
|
||||
-----------------------------------------------
|
||||
|
||||
Single-character classes are handled specially (see above). This applies to
|
||||
OP_CLASS and OP_REF. In both cases, the repeat information follows the base
|
||||
item. The matching code looks at the following opcode to see if it is one of
|
||||
Single-character classes are handled specially (see above). This section
|
||||
applies to OP_CLASS and OP_REF. In both cases, the repeat information follows
|
||||
the base item. The matching code looks at the following opcode to see if it is
|
||||
one of
|
||||
|
||||
OP_CRSTAR
|
||||
OP_CRMINSTAR
|
||||
@@ -230,7 +274,9 @@ item. The matching code looks at the following opcode to see if it is one of
|
||||
OP_CRMINRANGE
|
||||
|
||||
All but the last two are just single-byte items. The others are followed by
|
||||
four bytes of data, comprising the minimum and maximum repeat counts.
|
||||
four bytes of data, comprising the minimum and maximum repeat counts. There are
|
||||
no special possessive opcodes for these repeats; a possessive repeat is
|
||||
compiled into an atomic group.
|
||||
|
||||
|
||||
Brackets and alternation
|
||||
@@ -239,29 +285,25 @@ Brackets and alternation
|
||||
A pair of non-capturing (round) brackets is wrapped round each expression at
|
||||
compile time, so alternation always happens in the context of brackets.
|
||||
|
||||
Non-capturing brackets use the opcode OP_BRA, while capturing brackets use
|
||||
OP_BRA+1, OP_BRA+2, etc. [Note for North Americans: "bracket" to some English
|
||||
speakers, including myself, can be round, square, curly, or pointy. Hence this
|
||||
usage.]
|
||||
[Note for North Americans: "bracket" to some English speakers, including
|
||||
myself, can be round, square, curly, or pointy. Hence this usage.]
|
||||
|
||||
Originally PCRE was limited to 99 capturing brackets (so as not to use up all
|
||||
the opcodes). From release 3.5, there is no limit. What happens is that the
|
||||
first ones, up to EXTRACT_BASIC_MAX are handled with separate opcodes, as
|
||||
above. If there are more, the opcode is set to EXTRACT_BASIC_MAX+1, and the
|
||||
first operation in the bracket is OP_BRANUMBER, followed by a 2-byte bracket
|
||||
number. This opcode is ignored while matching, but is fished out when handling
|
||||
the bracket itself. (They could have all been done like this, but I was making
|
||||
minimal changes.)
|
||||
Non-capturing brackets use the opcode OP_BRA. Originally PCRE was limited to 99
|
||||
capturing brackets and it used a different opcode for each one. From release
|
||||
3.5, the limit was removed by putting the bracket number into the data for
|
||||
higher-numbered brackets. From release 7.0 all capturing brackets are handled
|
||||
this way, using the single opcode OP_CBRA.
|
||||
|
||||
A bracket opcode is followed by LINK_SIZE bytes which give the offset to the
|
||||
next alternative OP_ALT or, if there aren't any branches, to the matching
|
||||
OP_KET opcode. Each OP_ALT is followed by LINK_SIZE bytes giving the offset to
|
||||
the next one, or to the OP_KET opcode.
|
||||
the next one, or to the OP_KET opcode. For capturing brackets, the bracket
|
||||
number immediately follows the offset, always as a 2-byte item.
|
||||
|
||||
OP_KET is used for subpatterns that do not repeat indefinitely, while
|
||||
OP_KETRMIN and OP_KETRMAX are used for indefinite repetitions, minimally or
|
||||
maximally respectively. All three are followed by LINK_SIZE bytes giving (as a
|
||||
positive number) the offset back to the matching OP_BRA opcode.
|
||||
positive number) the offset back to the matching bracket opcode.
|
||||
|
||||
If a subpattern is quantified such that it is permitted to match zero times, it
|
||||
is preceded by one of OP_BRAZERO or OP_BRAMINZERO. These are single-byte
|
||||
@@ -276,7 +318,14 @@ as appropriate.
|
||||
A subpattern with a bounded maximum repetition is replicated in a nested
|
||||
fashion up to the maximum number of times, with OP_BRAZERO or OP_BRAMINZERO
|
||||
before each replication after the minimum, so that, for example, (abc){2,5} is
|
||||
compiled as (abc)(abc)((abc)((abc)(abc)?)?)?.
|
||||
compiled as (abc)(abc)((abc)((abc)(abc)?)?)?, except that each bracketed group
|
||||
has the same number.
|
||||
|
||||
When a repeated subpattern has an unbounded upper limit, it is checked to see
|
||||
whether it could match an empty string. If this is the case, the opcode in the
|
||||
final replication is changed to OP_SBRA or OP_SCBRA. This tells the matcher
|
||||
that it needs to check for matching an empty string when it hits OP_KETRMIN or
|
||||
OP_KETRMAX, and if so, to break the loop.
|
||||
|
||||
|
||||
Assertions
|
||||
@@ -292,22 +341,27 @@ each alternative of a lookbehind assertion, allowing them to have different
|
||||
fixed lengths.
|
||||
|
||||
|
||||
Once-only subpatterns
|
||||
---------------------
|
||||
Once-only (atomic) subpatterns
|
||||
------------------------------
|
||||
|
||||
These are also just like other subpatterns, but they start with the opcode
|
||||
OP_ONCE.
|
||||
OP_ONCE. The check for matching an empty string in an unbounded repeat is
|
||||
handled entirely at runtime, so there is just this one opcode.
|
||||
|
||||
|
||||
Conditional subpatterns
|
||||
-----------------------
|
||||
|
||||
These are like other subpatterns, but they start with the opcode OP_COND. If
|
||||
These are like other subpatterns, but they start with the opcode OP_COND, or
|
||||
OP_SCOND for one that might match an empty string in an unbounded repeat. If
|
||||
the condition is a back reference, this is stored at the start of the
|
||||
subpattern using the opcode OP_CREF followed by two bytes containing the
|
||||
reference number. If the condition is "in recursion" (coded as "(?(R)"), the
|
||||
same scheme is used, with a "reference number" of 0xffff. Otherwise, a
|
||||
conditional subpattern always starts with one of the assertions.
|
||||
reference number. If the condition is "in recursion" (coded as "(?(R)"), or "in
|
||||
recursion of group x" (coded as "(?(Rx)"), the group number is stored at the
|
||||
start of the subpattern using the opcode OP_RREF, with a value of zero meaning "the
|
||||
whole pattern". For a DEFINE condition, just the single byte OP_DEF is used (it
|
||||
has no associated data). Otherwise, a conditional subpattern always starts with
|
||||
one of the assertions.
|
||||
|
||||
|
||||
Recursion
|
||||
@@ -345,4 +399,4 @@ at compile time, and so does not cause anything to be put into the compiled
|
||||
data.
|
||||
|
||||
Philip Hazel
|
||||
June 2006
|
||||
November 2006
|
||||
|
||||
File diff suppressed because it is too large
Load Diff
@@ -5,7 +5,7 @@
|
||||
/* This is the public header file for the PCRE library, to be #included by
|
||||
applications that call the PCRE functions.
|
||||
|
||||
Copyright (c) 1997-2005 University of Cambridge
|
||||
Copyright (c) 1997-2006 University of Cambridge
|
||||
|
||||
-----------------------------------------------------------------------------
|
||||
Redistribution and use in source and binary forms, with or without
|
||||
@@ -38,7 +38,7 @@ POSSIBILITY OF SUCH DAMAGE.
|
||||
|
||||
#ifndef _PCRE_H
|
||||
#define _PCRE_H
|
||||
|
||||
|
||||
#include "php_compat.h"
|
||||
|
||||
/* The current PCRE version information. */
|
||||
@@ -54,10 +54,10 @@ and libpcre.pc. The values are not put into configure.ac and substituted here
|
||||
cannot run ./configure. As it now stands, this file need not be edited in that
|
||||
circumstance. */
|
||||
|
||||
#define PCRE_MAJOR 6
|
||||
#define PCRE_MINOR 7
|
||||
#define PCRE_MAJOR 7
|
||||
#define PCRE_MINOR 0
|
||||
#define PCRE_PRERELEASE
|
||||
#define PCRE_DATE 04-Jul-2006
|
||||
#define PCRE_DATE 18-Dec-2006
|
||||
|
||||
/* Win32 uses DLL by default; it needs special stuff for exported functions
|
||||
when building PCRE. */
|
||||
@@ -120,6 +120,7 @@ extern "C" {
|
||||
#define PCRE_NEWLINE_CR 0x00100000
|
||||
#define PCRE_NEWLINE_LF 0x00200000
|
||||
#define PCRE_NEWLINE_CRLF 0x00300000
|
||||
#define PCRE_NEWLINE_ANY 0x00400000
|
||||
|
||||
/* Exec-time and get/set-time error codes */
|
||||
|
||||
@@ -127,7 +128,8 @@ extern "C" {
|
||||
#define PCRE_ERROR_NULL (-2)
|
||||
#define PCRE_ERROR_BADOPTION (-3)
|
||||
#define PCRE_ERROR_BADMAGIC (-4)
|
||||
#define PCRE_ERROR_UNKNOWN_NODE (-5)
|
||||
#define PCRE_ERROR_UNKNOWN_OPCODE (-5)
|
||||
#define PCRE_ERROR_UNKNOWN_NODE (-5) /* For backward compatibility */
|
||||
#define PCRE_ERROR_NOMEMORY (-6)
|
||||
#define PCRE_ERROR_NOSUBSTRING (-7)
|
||||
#define PCRE_ERROR_MATCHLIMIT (-8)
|
||||
@@ -144,6 +146,8 @@ extern "C" {
|
||||
#define PCRE_ERROR_DFA_WSSIZE (-19)
|
||||
#define PCRE_ERROR_DFA_RECURSE (-20)
|
||||
#define PCRE_ERROR_RECURSIONLIMIT (-21)
|
||||
#define PCRE_ERROR_NULLWSLIMIT (-22)
|
||||
#define PCRE_ERROR_BADNEWLINE (-23)
|
||||
|
||||
/* Request types for pcre_fullinfo() */
|
||||
|
||||
|
||||
File diff suppressed because it is too large
Load Diff
@@ -6,7 +6,7 @@
|
||||
and semantics are as close as possible to those of the Perl 5 language.
|
||||
|
||||
Written by Philip Hazel
|
||||
Copyright (c) 1997-2007 University of Cambridge
|
||||
Copyright (c) 1997-2006 University of Cambridge
|
||||
|
||||
-----------------------------------------------------------------------------
|
||||
Redistribution and use in source and binary forms, with or without
|
||||
|
||||
File diff suppressed because it is too large
Load Diff
@@ -6,7 +6,7 @@
|
||||
and semantics are as close as possible to those of the Perl 5 language.
|
||||
|
||||
Written by Philip Hazel
|
||||
Copyright (c) 1997-2007 University of Cambridge
|
||||
Copyright (c) 1997-2006 University of Cambridge
|
||||
|
||||
-----------------------------------------------------------------------------
|
||||
Redistribution and use in source and binary forms, with or without
|
||||
|
||||
@@ -6,7 +6,7 @@
|
||||
and semantics are as close as possible to those of the Perl 5 language.
|
||||
|
||||
Written by Philip Hazel
|
||||
Copyright (c) 1997-2007 University of Cambridge
|
||||
Copyright (c) 1997-2006 University of Cambridge
|
||||
|
||||
-----------------------------------------------------------------------------
|
||||
Redistribution and use in source and binary forms, with or without
|
||||
@@ -436,7 +436,6 @@ pcre_get_named_substring(const pcre *code, const char *subject, int *ovector,
|
||||
int n = get_first_set(code, stringname, ovector);
|
||||
if (n <= 0) return n;
|
||||
return pcre_get_substring(subject, ovector, stringcount, n, stringptr);
|
||||
|
||||
}
|
||||
|
||||
|
||||
|
||||
@@ -6,7 +6,7 @@
|
||||
and semantics are as close as possible to those of the Perl 5 language.
|
||||
|
||||
Written by Philip Hazel
|
||||
Copyright (c) 1997-2007 University of Cambridge
|
||||
Copyright (c) 1997-2006 University of Cambridge
|
||||
|
||||
-----------------------------------------------------------------------------
|
||||
Redistribution and use in source and binary forms, with or without
|
||||
@@ -51,6 +51,18 @@ differently, and global variables are not used (see pcre.in). */
|
||||
|
||||
|
||||
#ifndef VPCOMPAT
|
||||
|
||||
/**************************************************************************
|
||||
This code used to be here for use when compiling as a C++ library. However,
|
||||
according to Dair Grant it is not needed: "
|
||||
|
||||
Including 'extern "C"' in the declaration generates an "initialized and
|
||||
declared `extern'" warning from gcc 4.0.1. Since we include pcre_internal.h,
|
||||
which includes pcre.h, which declares these prototypes within an extern "C" {}
|
||||
block, we shouldn't need the prefix here."
|
||||
|
||||
So, from Release 7.0 I have cut this out.
|
||||
|
||||
#ifdef __cplusplus
|
||||
extern "C" void *(*pcre_malloc)(size_t) = malloc;
|
||||
extern "C" void (*pcre_free)(void *) = free;
|
||||
@@ -58,12 +70,13 @@ extern "C" void *(*pcre_stack_malloc)(size_t) = malloc;
|
||||
extern "C" void (*pcre_stack_free)(void *) = free;
|
||||
extern "C" int (*pcre_callout)(pcre_callout_block *) = NULL;
|
||||
#else
|
||||
**************************************************************************/
|
||||
|
||||
void *(*pcre_malloc)(size_t) = malloc;
|
||||
void (*pcre_free)(void *) = free;
|
||||
void *(*pcre_stack_malloc)(size_t) = malloc;
|
||||
void (*pcre_stack_free)(void *) = free;
|
||||
int (*pcre_callout)(pcre_callout_block *) = NULL;
|
||||
#endif
|
||||
#endif
|
||||
|
||||
/* End of pcre_globals.c */
|
||||
|
||||
@@ -6,7 +6,7 @@
|
||||
and semantics are as close as possible to those of the Perl 5 language.
|
||||
|
||||
Written by Philip Hazel
|
||||
Copyright (c) 1997-2007 University of Cambridge
|
||||
Copyright (c) 1997-2006 University of Cambridge
|
||||
|
||||
-----------------------------------------------------------------------------
|
||||
Redistribution and use in source and binary forms, with or without
|
||||
|
||||
@@ -7,7 +7,7 @@
|
||||
and semantics are as close as possible to those of the Perl 5 language.
|
||||
|
||||
Written by Philip Hazel
|
||||
Copyright (c) 1997-2007 University of Cambridge
|
||||
Copyright (c) 1997-2006 University of Cambridge
|
||||
|
||||
-----------------------------------------------------------------------------
|
||||
Redistribution and use in source and binary forms, with or without
|
||||
@@ -54,12 +54,16 @@ functions whose names all begin with "_pcre_". */
|
||||
/* Use a macro for debugging printing, 'cause that eliminates the use of #ifdef
|
||||
inline, and there are *still* stupid compilers about that don't like indented
|
||||
pre-processor statements, or at least there were when I first wrote this. After
|
||||
all, it had only been about 10 years then... */
|
||||
all, it had only been about 10 years then...
|
||||
|
||||
It turns out that the Mac Debugging.h header also defines the macro DPRINTF, so
|
||||
be absolutely sure we get our version. */
|
||||
|
||||
#undef DPRINTF
|
||||
#ifdef DEBUG
|
||||
#define DPRINTF(p) printf p
|
||||
#else
|
||||
#define DPRINTF(p) /*nothing*/
|
||||
#define DPRINTF(p) /* Nothing */
|
||||
#endif
|
||||
|
||||
|
||||
@@ -118,13 +122,48 @@ Unix, where it is defined in sys/types, so use "uschar" instead. */
|
||||
|
||||
typedef unsigned char uschar;
|
||||
|
||||
/* PCRE is able to support 3 different kinds of newline (CR, LF, CRLF). The
|
||||
following macro is used to package up testing for newlines. NLBLOCK is defined
|
||||
in the various modules to indicate in which datablock the parameters exist. */
|
||||
/* This is an unsigned int value that no character can ever have. UTF-8
|
||||
characters only go up to 0x7fffffff (though Unicode doesn't go beyond
|
||||
0x0010ffff). */
|
||||
|
||||
#define NOTACHAR 0xffffffff
|
||||
|
||||
/* PCRE is able to support several different kinds of newline (CR, LF, CRLF,
|
||||
and "all" at present). The following macros are used to package up testing for
|
||||
newlines. NLBLOCK, PSSTART, and PSEND are defined in the various modules to
|
||||
indicate in which datablock the parameters exist, and what the start/end of
|
||||
string field names are. */
|
||||
|
||||
#define NLTYPE_FIXED 0 /* Newline is a fixed length string */
|
||||
#define NLTYPE_ANY 1 /* Newline is any Unicode line ending */
|
||||
|
||||
/* This macro checks for a newline at the given position */
|
||||
|
||||
#define IS_NEWLINE(p) \
|
||||
((p)[0] == NLBLOCK->nl[0] && \
|
||||
(NLBLOCK->nllen == 1 || (p)[1] == NLBLOCK->nl[1]))
|
||||
((NLBLOCK->nltype != NLTYPE_FIXED)? \
|
||||
((p) < NLBLOCK->PSEND && \
|
||||
_pcre_is_newline((p), NLBLOCK->PSEND, &(NLBLOCK->nllen), utf8) \
|
||||
) \
|
||||
: \
|
||||
((p) <= NLBLOCK->PSEND - NLBLOCK->nllen && \
|
||||
(p)[0] == NLBLOCK->nl[0] && \
|
||||
(NLBLOCK->nllen == 1 || (p)[1] == NLBLOCK->nl[1]) \
|
||||
) \
|
||||
)
|
||||
|
||||
/* This macro checks for a newline immediately preceding the given position */
|
||||
|
||||
#define WAS_NEWLINE(p) \
|
||||
((NLBLOCK->nltype != NLTYPE_FIXED)? \
|
||||
((p) > NLBLOCK->PSSTART && \
|
||||
_pcre_was_newline((p), NLBLOCK->PSSTART, &(NLBLOCK->nllen), utf8) \
|
||||
) \
|
||||
: \
|
||||
((p) >= NLBLOCK->PSSTART + NLBLOCK->nllen && \
|
||||
(p)[-NLBLOCK->nllen] == NLBLOCK->nl[0] && \
|
||||
(NLBLOCK->nllen == 1 || (p)[-NLBLOCK->nllen+1] == NLBLOCK->nl[1]) \
|
||||
) \
|
||||
)
|
||||
|
||||
/* When PCRE is compiled as a C++ library, the subject pointer can be replaced
|
||||
with a custom type. This makes it possible, for example, to allow pcre_exec()
|
||||
@@ -282,7 +321,7 @@ we know we are in UTF-8 mode. */
|
||||
|
||||
#define GETCHAR(c, eptr) \
|
||||
c = *eptr; \
|
||||
if ((c & 0xc0) == 0xc0) \
|
||||
if (c >= 0xc0) \
|
||||
{ \
|
||||
int gcii; \
|
||||
int gcaa = _pcre_utf8_table4[c & 0x3f]; /* Number of additional bytes */ \
|
||||
@@ -300,7 +339,7 @@ pointer. */
|
||||
|
||||
#define GETCHARTEST(c, eptr) \
|
||||
c = *eptr; \
|
||||
if (utf8 && (c & 0xc0) == 0xc0) \
|
||||
if (utf8 && c >= 0xc0) \
|
||||
{ \
|
||||
int gcii; \
|
||||
int gcaa = _pcre_utf8_table4[c & 0x3f]; /* Number of additional bytes */ \
|
||||
@@ -318,7 +357,7 @@ know we are in UTF-8 mode. */
|
||||
|
||||
#define GETCHARINC(c, eptr) \
|
||||
c = *eptr++; \
|
||||
if ((c & 0xc0) == 0xc0) \
|
||||
if (c >= 0xc0) \
|
||||
{ \
|
||||
int gcaa = _pcre_utf8_table4[c & 0x3f]; /* Number of additional bytes */ \
|
||||
int gcss = 6*gcaa; \
|
||||
@@ -334,7 +373,7 @@ know we are in UTF-8 mode. */
|
||||
|
||||
#define GETCHARINCTEST(c, eptr) \
|
||||
c = *eptr++; \
|
||||
if (utf8 && (c & 0xc0) == 0xc0) \
|
||||
if (utf8 && c >= 0xc0) \
|
||||
{ \
|
||||
int gcaa = _pcre_utf8_table4[c & 0x3f]; /* Number of additional bytes */ \
|
||||
int gcss = 6*gcaa; \
|
||||
@@ -351,7 +390,7 @@ if there are extra bytes. This is called when we know we are in UTF-8 mode. */
|
||||
|
||||
#define GETCHARLEN(c, eptr, len) \
|
||||
c = *eptr; \
|
||||
if ((c & 0xc0) == 0xc0) \
|
||||
if (c >= 0xc0) \
|
||||
{ \
|
||||
int gcii; \
|
||||
int gcaa = _pcre_utf8_table4[c & 0x3f]; /* Number of additional bytes */ \
|
||||
@@ -404,20 +443,21 @@ bits. */
|
||||
/* Masks for identifying the public options that are permitted at compile
|
||||
time, run time, or study time, respectively. */
|
||||
|
||||
#define PCRE_NEWLINE_BITS (PCRE_NEWLINE_CR|PCRE_NEWLINE_LF|PCRE_NEWLINE_ANY)
|
||||
|
||||
#define PUBLIC_OPTIONS \
|
||||
(PCRE_CASELESS|PCRE_EXTENDED|PCRE_ANCHORED|PCRE_MULTILINE| \
|
||||
PCRE_DOTALL|PCRE_DOLLAR_ENDONLY|PCRE_EXTRA|PCRE_UNGREEDY|PCRE_UTF8| \
|
||||
PCRE_NO_AUTO_CAPTURE|PCRE_NO_UTF8_CHECK|PCRE_AUTO_CALLOUT|PCRE_FIRSTLINE| \
|
||||
PCRE_DUPNAMES|PCRE_NEWLINE_CR|PCRE_NEWLINE_LF)
|
||||
PCRE_DUPNAMES|PCRE_NEWLINE_BITS)
|
||||
|
||||
#define PUBLIC_EXEC_OPTIONS \
|
||||
(PCRE_ANCHORED|PCRE_NOTBOL|PCRE_NOTEOL|PCRE_NOTEMPTY|PCRE_NO_UTF8_CHECK| \
|
||||
PCRE_PARTIAL|PCRE_NEWLINE_CR|PCRE_NEWLINE_LF)
|
||||
PCRE_PARTIAL|PCRE_NEWLINE_BITS)
|
||||
|
||||
#define PUBLIC_DFA_EXEC_OPTIONS \
|
||||
(PCRE_ANCHORED|PCRE_NOTBOL|PCRE_NOTEOL|PCRE_NOTEMPTY|PCRE_NO_UTF8_CHECK| \
|
||||
PCRE_PARTIAL|PCRE_DFA_SHORTEST|PCRE_DFA_RESTART|PCRE_NEWLINE_CR| \
|
||||
PCRE_NEWLINE_LF)
|
||||
PCRE_PARTIAL|PCRE_DFA_SHORTEST|PCRE_DFA_RESTART|PCRE_NEWLINE_BITS)
|
||||
|
||||
#define PUBLIC_STUDY_OPTIONS 0 /* None defined */
|
||||
|
||||
@@ -449,9 +489,7 @@ typedef int BOOL;
|
||||
#define FALSE 0
|
||||
#define TRUE 1
|
||||
|
||||
/* Escape items that are just an encoding of a particular data value. Note that
|
||||
ESC_n is defined as yet another macro, which is set in config.h to either \n
|
||||
(the default) or \r (which some people want). */
|
||||
/* Escape items that are just an encoding of a particular data value. */
|
||||
|
||||
#ifndef ESC_e
|
||||
#define ESC_e 27
|
||||
@@ -462,7 +500,7 @@ ESC_n is defined as yet another macro, which is set in config.h to either \n
|
||||
#endif
|
||||
|
||||
#ifndef ESC_n
|
||||
#define ESC_n NEWLINE
|
||||
#define ESC_n '\n'
|
||||
#endif
|
||||
|
||||
#ifndef ESC_r
|
||||
@@ -501,21 +539,28 @@ value such as \n. They must have non-zero values, as check_escape() returns
|
||||
their negation. Also, they must appear in the same order as in the opcode
|
||||
definitions below, up to ESC_z. There's a dummy for OP_ANY because it
|
||||
corresponds to "." rather than an escape sequence. The final one must be
|
||||
ESC_REF as subsequent values are used for \1, \2, \3, etc. There is are two
|
||||
tests in the code for an escape greater than ESC_b and less than ESC_Z to
|
||||
detect the types that may be repeated. These are the types that consume
|
||||
characters. If any new escapes are put in between that don't consume a
|
||||
ESC_REF as subsequent values are used for backreferences (\1, \2, \3, etc).
|
||||
There are two tests in the code for an escape greater than ESC_b and less than
|
||||
ESC_Z to detect the types that may be repeated. These are the types that
|
||||
consume characters. If any new escapes are put in between that don't consume a
|
||||
character, that code will have to change. */
|
||||
|
||||
enum { ESC_A = 1, ESC_G, ESC_B, ESC_b, ESC_D, ESC_d, ESC_S, ESC_s, ESC_W,
|
||||
ESC_w, ESC_dum1, ESC_C, ESC_P, ESC_p, ESC_X, ESC_Z, ESC_z, ESC_E,
|
||||
ESC_Q, ESC_REF };
|
||||
ESC_w, ESC_dum1, ESC_C, ESC_P, ESC_p, ESC_R, ESC_X, ESC_Z, ESC_z,
|
||||
ESC_E, ESC_Q, ESC_k, ESC_REF };
|
||||
|
||||
|
||||
/* Opcode table: OP_BRA must be last, as all values >= it are used for brackets
|
||||
that extract substrings. Starting from 1 (i.e. after OP_END), the values up to
|
||||
OP_EOD must correspond in order to the list of escapes immediately above.
|
||||
Note that whenever this list is updated, the two macro definitions that follow
|
||||
must also be updated to match. */
|
||||
|
||||
To keep stored, compiled patterns compatible, new opcodes should be added
|
||||
immediately before OP_BRA, where (since release 7.0) a gap is left for this
|
||||
purpose.
|
||||
|
||||
*** NOTE NOTE NOTE *** Whenever this list is updated, the two macro definitions
|
||||
that follow must also be updated to match. There is also a table called
|
||||
"coptable" in pcre_dfa_exec.c that must be updated. */
|
||||
|
||||
enum {
|
||||
OP_END, /* 0 End of pattern */
|
||||
@@ -536,111 +581,123 @@ enum {
|
||||
OP_ANYBYTE, /* 12 Match any byte (\C); different to OP_ANY for UTF-8 */
|
||||
OP_NOTPROP, /* 13 \P (not Unicode property) */
|
||||
OP_PROP, /* 14 \p (Unicode property) */
|
||||
OP_EXTUNI, /* 15 \X (extended Unicode sequence */
|
||||
OP_EODN, /* 16 End of data or \n at end of data: \Z. */
|
||||
OP_EOD, /* 17 End of data: \z */
|
||||
OP_ANYNL, /* 15 \R (any newline sequence) */
|
||||
OP_EXTUNI, /* 16 \X (extended Unicode sequence) */
|
||||
OP_EODN, /* 17 End of data or \n at end of data: \Z. */
|
||||
OP_EOD, /* 18 End of data: \z */
|
||||
|
||||
OP_OPT, /* 18 Set runtime options */
|
||||
OP_CIRC, /* 19 Start of line - varies with multiline switch */
|
||||
OP_DOLL, /* 20 End of line - varies with multiline switch */
|
||||
OP_CHAR, /* 21 Match one character, casefully */
|
||||
OP_CHARNC, /* 22 Match one character, caselessly */
|
||||
OP_NOT, /* 23 Match one character, not the following one */
|
||||
OP_OPT, /* 19 Set runtime options */
|
||||
OP_CIRC, /* 20 Start of line - varies with multiline switch */
|
||||
OP_DOLL, /* 21 End of line - varies with multiline switch */
|
||||
OP_CHAR, /* 22 Match one character, casefully */
|
||||
OP_CHARNC, /* 23 Match one character, caselessly */
|
||||
OP_NOT, /* 24 Match one character, not the following one */
|
||||
|
||||
OP_STAR, /* 24 The maximizing and minimizing versions of */
|
||||
OP_MINSTAR, /* 25 all these opcodes must come in pairs, with */
|
||||
OP_PLUS, /* 26 the minimizing one second. */
|
||||
OP_MINPLUS, /* 27 This first set applies to single characters */
|
||||
OP_QUERY, /* 28 */
|
||||
OP_MINQUERY, /* 29 */
|
||||
OP_UPTO, /* 30 From 0 to n matches */
|
||||
OP_MINUPTO, /* 31 */
|
||||
OP_EXACT, /* 32 Exactly n matches */
|
||||
OP_STAR, /* 25 The maximizing and minimizing versions of */
|
||||
OP_MINSTAR, /* 26 these six opcodes must come in pairs, with */
|
||||
OP_PLUS, /* 27 the minimizing one second. */
|
||||
OP_MINPLUS, /* 28 This first set applies to single characters.*/
|
||||
OP_QUERY, /* 29 */
|
||||
OP_MINQUERY, /* 30 */
|
||||
|
||||
OP_NOTSTAR, /* 33 The maximizing and minimizing versions of */
|
||||
OP_NOTMINSTAR, /* 34 all these opcodes must come in pairs, with */
|
||||
OP_NOTPLUS, /* 35 the minimizing one second. */
|
||||
OP_NOTMINPLUS, /* 36 This set applies to "not" single characters */
|
||||
OP_NOTQUERY, /* 37 */
|
||||
OP_NOTMINQUERY, /* 38 */
|
||||
OP_NOTUPTO, /* 39 From 0 to n matches */
|
||||
OP_NOTMINUPTO, /* 40 */
|
||||
OP_NOTEXACT, /* 41 Exactly n matches */
|
||||
OP_UPTO, /* 31 From 0 to n matches */
|
||||
OP_MINUPTO, /* 32 */
|
||||
OP_EXACT, /* 33 Exactly n matches */
|
||||
|
||||
OP_TYPESTAR, /* 42 The maximizing and minimizing versions of */
|
||||
OP_TYPEMINSTAR, /* 43 all these opcodes must come in pairs, with */
|
||||
OP_TYPEPLUS, /* 44 the minimizing one second. These codes must */
|
||||
OP_TYPEMINPLUS, /* 45 be in exactly the same order as those above. */
|
||||
OP_TYPEQUERY, /* 46 This set applies to character types such as \d */
|
||||
OP_TYPEMINQUERY, /* 47 */
|
||||
OP_TYPEUPTO, /* 48 From 0 to n matches */
|
||||
OP_TYPEMINUPTO, /* 49 */
|
||||
OP_TYPEEXACT, /* 50 Exactly n matches */
|
||||
OP_POSSTAR, /* 34 Possessified star */
|
||||
OP_POSPLUS, /* 35 Possessified plus */
|
||||
OP_POSQUERY, /* 36 Possessified query */
|
||||
OP_POSUPTO, /* 37 Possessified upto */
|
||||
|
||||
OP_CRSTAR, /* 51 The maximizing and minimizing versions of */
|
||||
OP_CRMINSTAR, /* 52 all these opcodes must come in pairs, with */
|
||||
OP_CRPLUS, /* 53 the minimizing one second. These codes must */
|
||||
OP_CRMINPLUS, /* 54 be in exactly the same order as those above. */
|
||||
OP_CRQUERY, /* 55 These are for character classes and back refs */
|
||||
OP_CRMINQUERY, /* 56 */
|
||||
OP_CRRANGE, /* 57 These are different to the three sets above. */
|
||||
OP_CRMINRANGE, /* 58 */
|
||||
OP_NOTSTAR, /* 38 The maximizing and minimizing versions of */
|
||||
OP_NOTMINSTAR, /* 39 these six opcodes must come in pairs, with */
|
||||
OP_NOTPLUS, /* 40 the minimizing one second. They must be in */
|
||||
OP_NOTMINPLUS, /* 41 exactly the same order as those above. */
|
||||
OP_NOTQUERY, /* 42 This set applies to "not" single characters. */
|
||||
OP_NOTMINQUERY, /* 43 */
|
||||
|
||||
OP_CLASS, /* 59 Match a character class, chars < 256 only */
|
||||
OP_NCLASS, /* 60 Same, but the bitmap was created from a negative
|
||||
OP_NOTUPTO, /* 44 From 0 to n matches */
|
||||
OP_NOTMINUPTO, /* 45 */
|
||||
OP_NOTEXACT, /* 46 Exactly n matches */
|
||||
|
||||
OP_NOTPOSSTAR, /* 47 Possessified versions */
|
||||
OP_NOTPOSPLUS, /* 48 */
|
||||
OP_NOTPOSQUERY, /* 49 */
|
||||
OP_NOTPOSUPTO, /* 50 */
|
||||
|
||||
OP_TYPESTAR, /* 51 The maximizing and minimizing versions of */
|
||||
OP_TYPEMINSTAR, /* 52 these six opcodes must come in pairs, with */
|
||||
OP_TYPEPLUS, /* 53 the minimizing one second. These codes must */
|
||||
OP_TYPEMINPLUS, /* 54 be in exactly the same order as those above. */
|
||||
OP_TYPEQUERY, /* 55 This set applies to character types such as \d */
|
||||
OP_TYPEMINQUERY, /* 56 */
|
||||
|
||||
OP_TYPEUPTO, /* 57 From 0 to n matches */
|
||||
OP_TYPEMINUPTO, /* 58 */
|
||||
OP_TYPEEXACT, /* 59 Exactly n matches */
|
||||
|
||||
OP_TYPEPOSSTAR, /* 60 Possessified versions */
|
||||
OP_TYPEPOSPLUS, /* 61 */
|
||||
OP_TYPEPOSQUERY, /* 62 */
|
||||
OP_TYPEPOSUPTO, /* 63 */
|
||||
|
||||
OP_CRSTAR, /* 64 The maximizing and minimizing versions of */
|
||||
OP_CRMINSTAR, /* 65 all these opcodes must come in pairs, with */
|
||||
OP_CRPLUS, /* 66 the minimizing one second. These codes must */
|
||||
OP_CRMINPLUS, /* 67 be in exactly the same order as those above. */
|
||||
OP_CRQUERY, /* 68 These are for character classes and back refs */
|
||||
OP_CRMINQUERY, /* 69 */
|
||||
OP_CRRANGE, /* 70 These are different to the three sets above. */
|
||||
OP_CRMINRANGE, /* 71 */
|
||||
|
||||
OP_CLASS, /* 72 Match a character class, chars < 256 only */
|
||||
OP_NCLASS, /* 73 Same, but the bitmap was created from a negative
|
||||
class - the difference is relevant only when a UTF-8
|
||||
character > 255 is encountered. */
|
||||
|
||||
OP_XCLASS, /* 61 Extended class for handling UTF-8 chars within the
|
||||
OP_XCLASS, /* 74 Extended class for handling UTF-8 chars within the
|
||||
class. This does both positive and negative. */
|
||||
|
||||
OP_REF, /* 62 Match a back reference */
|
||||
OP_RECURSE, /* 63 Match a numbered subpattern (possibly recursive) */
|
||||
OP_CALLOUT, /* 64 Call out to external function if provided */
|
||||
OP_REF, /* 75 Match a back reference */
|
||||
OP_RECURSE, /* 76 Match a numbered subpattern (possibly recursive) */
|
||||
OP_CALLOUT, /* 77 Call out to external function if provided */
|
||||
|
||||
OP_ALT, /* 65 Start of alternation */
|
||||
OP_KET, /* 66 End of group that doesn't have an unbounded repeat */
|
||||
OP_KETRMAX, /* 67 These two must remain together and in this */
|
||||
OP_KETRMIN, /* 68 order. They are for groups the repeat for ever. */
|
||||
OP_ALT, /* 78 Start of alternation */
|
||||
OP_KET, /* 79 End of group that doesn't have an unbounded repeat */
|
||||
OP_KETRMAX, /* 80 These two must remain together and in this */
|
||||
OP_KETRMIN, /* 81 order. They are for groups that repeat for ever. */
|
||||
|
||||
/* The assertions must come before ONCE and COND */
|
||||
/* The assertions must come before BRA, CBRA, ONCE, and COND.*/
|
||||
|
||||
OP_ASSERT, /* 69 Positive lookahead */
|
||||
OP_ASSERT_NOT, /* 70 Negative lookahead */
|
||||
OP_ASSERTBACK, /* 71 Positive lookbehind */
|
||||
OP_ASSERTBACK_NOT, /* 72 Negative lookbehind */
|
||||
OP_REVERSE, /* 73 Move pointer back - used in lookbehind assertions */
|
||||
OP_ASSERT, /* 82 Positive lookahead */
|
||||
OP_ASSERT_NOT, /* 83 Negative lookahead */
|
||||
OP_ASSERTBACK, /* 84 Positive lookbehind */
|
||||
OP_ASSERTBACK_NOT, /* 85 Negative lookbehind */
|
||||
OP_REVERSE, /* 86 Move pointer back - used in lookbehind assertions */
|
||||
|
||||
/* ONCE and COND must come after the assertions, with ONCE first, as there's
|
||||
a test for >= ONCE for a subpattern that isn't an assertion. */
|
||||
/* ONCE, BRA, CBRA, and COND must come after the assertions, with ONCE first,
|
||||
as there's a test for >= ONCE for a subpattern that isn't an assertion. */
|
||||
|
||||
OP_ONCE, /* 74 Once matched, don't back up into the subpattern */
|
||||
OP_COND, /* 75 Conditional group */
|
||||
OP_CREF, /* 76 Used to hold an extraction string number (cond ref) */
|
||||
OP_ONCE, /* 87 Atomic group */
|
||||
OP_BRA, /* 88 Start of non-capturing bracket */
|
||||
OP_CBRA, /* 89 Start of capturing bracket */
|
||||
OP_COND, /* 90 Conditional group */
|
||||
|
||||
OP_BRAZERO, /* 77 These two must remain together and in this */
|
||||
OP_BRAMINZERO, /* 78 order. */
|
||||
/* These three must follow the previous three, in the same order. There's a
|
||||
check for >= SBRA to distinguish the two sets. */
|
||||
|
||||
OP_BRANUMBER, /* 79 Used for extracting brackets whose number is greater
|
||||
than can fit into an opcode. */
|
||||
OP_SBRA, /* 91 Start of non-capturing bracket, check empty */
|
||||
OP_SCBRA, /* 92 Start of capturing bracket, check empty */
|
||||
OP_SCOND, /* 93 Conditional group, check empty */
|
||||
|
||||
OP_BRA /* 80 This and greater values are used for brackets that
|
||||
extract substrings up to EXTRACT_BASIC_MAX. After
|
||||
that, use is made of OP_BRANUMBER. */
|
||||
OP_CREF, /* 94 Used to hold a capture number as condition */
|
||||
OP_RREF, /* 95 Used to hold a recursion number as condition */
|
||||
OP_DEF, /* 96 The DEFINE condition */
|
||||
|
||||
OP_BRAZERO, /* 97 These two must remain together and in this */
|
||||
OP_BRAMINZERO /* 98 order. */
|
||||
};
|
||||
|
||||
/* WARNING WARNING WARNING: There is an implicit assumption in pcre.c and
|
||||
study.c that all opcodes are less than 128 in value. This makes handling UTF-8
|
||||
character sequences easier. */
|
||||
|
||||
/* The highest extraction number before we have to start using additional
|
||||
bytes. (Originally PCRE didn't have support for extraction counts highter than
|
||||
this number.) The value is limited by the number of opcodes left after OP_BRA,
|
||||
i.e. 255 - OP_BRA. We actually set it a bit lower to leave room for additional
|
||||
opcodes. */
|
||||
|
||||
#define EXTRACT_BASIC_MAX 100
|
||||
|
||||
|
||||
/* This macro defines textual names for all the opcodes. These are used only
|
||||
for debugging. The macro is referenced only in pcre_printint.c. */
|
||||
@@ -648,17 +705,21 @@ for debugging. The macro is referenced only in pcre_printint.c. */
|
||||
#define OP_NAME_LIST \
|
||||
"End", "\\A", "\\G", "\\B", "\\b", "\\D", "\\d", \
|
||||
"\\S", "\\s", "\\W", "\\w", "Any", "Anybyte", \
|
||||
"notprop", "prop", "extuni", \
|
||||
"notprop", "prop", "anynl", "extuni", \
|
||||
"\\Z", "\\z", \
|
||||
"Opt", "^", "$", "char", "charnc", "not", \
|
||||
"*", "*?", "+", "+?", "?", "??", "{", "{", "{", \
|
||||
"*+","++", "?+", "{", \
|
||||
"*", "*?", "+", "+?", "?", "??", "{", "{", "{", \
|
||||
"*+","++", "?+", "{", \
|
||||
"*", "*?", "+", "+?", "?", "??", "{", "{", "{", \
|
||||
"*+","++", "?+", "{", \
|
||||
"*", "*?", "+", "+?", "?", "??", "{", "{", \
|
||||
"class", "nclass", "xclass", "Ref", "Recurse", "Callout", \
|
||||
"Alt", "Ket", "KetRmax", "KetRmin", "Assert", "Assert not", \
|
||||
"AssertB", "AssertB not", "Reverse", "Once", "Cond", "Cond ref",\
|
||||
"Brazero", "Braminzero", "Branumber", "Bra"
|
||||
"AssertB", "AssertB not", "Reverse", \
|
||||
"Once", "Bra 0", "Bra", "Cond", "SBra 0", "SBra", "SCond", \
|
||||
"Cond ref", "Cond rec", "Cond def", "Brazero", "Braminzero"
|
||||
|
||||
|
||||
/* This macro defines the length of fixed length operations in the compiled
|
||||
@@ -674,7 +735,7 @@ in UTF-8 mode. The code that uses this table must know about such things. */
|
||||
1, /* End */ \
|
||||
1, 1, 1, 1, 1, 1, 1, 1, 1, 1, /* \A, \G, \B, \b, \D, \d, \S, \s, \W, \w */ \
|
||||
1, 1, /* Any, Anybyte */ \
|
||||
3, 3, 1, /* NOTPROP, PROP, EXTUNI */ \
|
||||
3, 3, 1, 1, /* NOTPROP, PROP, ANYNL, EXTUNI */ \
|
||||
1, 1, 2, 1, 1, /* \Z, \z, Opt, ^, $ */ \
|
||||
2, /* Char - the minimum length */ \
|
||||
2, /* Charnc - the minimum length */ \
|
||||
@@ -682,12 +743,15 @@ in UTF-8 mode. The code that uses this table must know about such things. */
|
||||
/* Positive single-char repeats ** These are */ \
|
||||
2, 2, 2, 2, 2, 2, /* *, *?, +, +?, ?, ?? ** minima in */ \
|
||||
4, 4, 4, /* upto, minupto, exact ** UTF-8 mode */ \
|
||||
2, 2, 2, 4, /* *+, ++, ?+, upto+ */ \
|
||||
/* Negative single-char repeats - only for chars < 256 */ \
|
||||
2, 2, 2, 2, 2, 2, /* NOT *, *?, +, +?, ?, ?? */ \
|
||||
4, 4, 4, /* NOT upto, minupto, exact */ \
|
||||
2, 2, 2, 4, /* Possessive *, +, ?, upto */ \
|
||||
/* Positive type repeats */ \
|
||||
2, 2, 2, 2, 2, 2, /* Type *, *?, +, +?, ?, ?? */ \
|
||||
4, 4, 4, /* Type upto, minupto, exact */ \
|
||||
2, 2, 2, 4, /* Possessive *+, ++, ?+, upto+ */ \
|
||||
/* Character class & ref repeats */ \
|
||||
1, 1, 1, 1, 1, 1, /* *, *?, +, +?, ?, ?? */ \
|
||||
5, 5, /* CRRANGE, CRMINRANGE */ \
|
||||
@@ -706,17 +770,22 @@ in UTF-8 mode. The code that uses this table must know about such things. */
|
||||
1+LINK_SIZE, /* Assert behind */ \
|
||||
1+LINK_SIZE, /* Assert behind not */ \
|
||||
1+LINK_SIZE, /* Reverse */ \
|
||||
1+LINK_SIZE, /* Once */ \
|
||||
1+LINK_SIZE, /* ONCE */ \
|
||||
1+LINK_SIZE, /* BRA */ \
|
||||
3+LINK_SIZE, /* CBRA */ \
|
||||
1+LINK_SIZE, /* COND */ \
|
||||
1+LINK_SIZE, /* SBRA */ \
|
||||
3+LINK_SIZE, /* SCBRA */ \
|
||||
1+LINK_SIZE, /* SCOND */ \
|
||||
3, /* CREF */ \
|
||||
3, /* RREF */ \
|
||||
1, /* DEF */ \
|
||||
1, 1, /* BRAZERO, BRAMINZERO */ \
|
||||
3, /* BRANUMBER */ \
|
||||
1+LINK_SIZE /* BRA */ \
|
||||
|
||||
|
||||
/* A magic value for OP_CREF to indicate the "in recursion" condition. */
|
||||
/* A magic value for OP_RREF to indicate the "any recursion" condition. */
|
||||
|
||||
#define CREF_RECURSE 0xffff
|
||||
#define RREF_ANY 0xffff
|
||||
|
||||
/* Error code numbers. They are given names so that they can more easily be
|
||||
tracked. */
|
||||
@@ -726,7 +795,7 @@ enum { ERR0, ERR1, ERR2, ERR3, ERR4, ERR5, ERR6, ERR7, ERR8, ERR9,
|
||||
ERR20, ERR21, ERR22, ERR23, ERR24, ERR25, ERR26, ERR27, ERR28, ERR29,
|
||||
ERR30, ERR31, ERR32, ERR33, ERR34, ERR35, ERR36, ERR37, ERR38, ERR39,
|
||||
ERR40, ERR41, ERR42, ERR43, ERR44, ERR45, ERR46, ERR47, ERR48, ERR49,
|
||||
ERR50, ERR51 };
|
||||
ERR50, ERR51, ERR52, ERR53, ERR54, ERR55, ERR56, ERR57 };
|
||||
|
||||
/* The real format of the start of the pcre block; the index of names and the
|
||||
code vector run on as long as necessary after the end. We store an explicit
|
||||
@@ -781,17 +850,23 @@ typedef struct compile_data {
|
||||
const uschar *fcc; /* Points to case-flipping table */
|
||||
const uschar *cbits; /* Points to character type table */
|
||||
const uschar *ctypes; /* Points to table of type maps */
|
||||
const uschar *start_workspace;/* The start of working space */
|
||||
const uschar *start_code; /* The start of the compiled code */
|
||||
const uschar *start_pattern; /* The start of the pattern */
|
||||
const uschar *end_pattern; /* The end of the pattern */
|
||||
uschar *hwm; /* High watermark of workspace */
|
||||
uschar *name_table; /* The name/number table */
|
||||
int names_found; /* Number of entries so far */
|
||||
int name_entry_size; /* Size of each entry */
|
||||
int bracount; /* Count of capturing parens */
|
||||
int top_backref; /* Maximum back reference */
|
||||
unsigned int backref_map; /* Bitmap of low back refs */
|
||||
int external_options; /* External (initial) options */
|
||||
int req_varyopt; /* "After variable item" flag for reqbyte */
|
||||
BOOL nopartial; /* Set TRUE if partial won't work */
|
||||
int nllen; /* 1 or 2 for newline string length */
|
||||
uschar nl[4]; /* Newline string */
|
||||
int nltype; /* Newline type */
|
||||
int nllen; /* Newline string length */
|
||||
uschar nl[4]; /* Newline string when fixed length */
|
||||
} compile_data;
|
||||
|
||||
/* Structure for maintaining a chain of pointers to the currently incomplete
|
||||
@@ -824,6 +899,16 @@ This isn't used for a "normal" compilation of pcre. */
|
||||
|
||||
struct heapframe;
|
||||
|
||||
/* Structure for building a chain of data for holding the values of the subject
|
||||
pointer at the start of each subpattern, so as to detect when an empty string
|
||||
has been matched by a subpattern - to break infinite loops. */
|
||||
|
||||
typedef struct eptrblock {
|
||||
struct eptrblock *epb_prev; /* Link to the previous block in the chain */
|
||||
USPTR epb_saved_eptr; /* Subject pointer saved at the start of a subpattern */
|
||||
} eptrblock;
|
||||
|
||||
|
||||
/* Structure for passing "static" information around between the functions
|
||||
doing traditional NFA matching, so that they are thread-safe. */
|
||||
|
||||
@@ -834,8 +919,9 @@ typedef struct match_data {
|
||||
int *offset_vector; /* Offset vector */
|
||||
int offset_end; /* One past the end */
|
||||
int offset_max; /* The maximum usable for return data */
|
||||
int nllen; /* 1 or 2 for newline string length */
|
||||
uschar nl[4]; /* Newline string */
|
||||
int nltype; /* Newline type */
|
||||
int nllen; /* Newline string length */
|
||||
uschar nl[4]; /* Newline string when fixed */
|
||||
const uschar *lcc; /* Points to lower casing table */
|
||||
const uschar *ctypes; /* Points to table of type maps */
|
||||
BOOL offset_overflow; /* Set if too many extractions */
|
||||
@@ -854,6 +940,8 @@ typedef struct match_data {
|
||||
int end_offset_top; /* Highwater mark at end of match */
|
||||
int capture_last; /* Most recent capture number */
|
||||
int start_offset; /* The start offset value */
|
||||
eptrblock *eptrchain; /* Chain of eptrblocks for tail recursions */
|
||||
int eptrn; /* Next free eptrblock */
|
||||
recursion_info *recursive; /* Linked list of recursion data */
|
||||
void *callout_data; /* To pass back to callouts */
|
||||
struct heapframe *thisframe; /* Used only when compiling for no recursion */
|
||||
@@ -869,8 +957,9 @@ typedef struct dfa_match_data {
|
||||
const uschar *tables; /* Character tables */
|
||||
int moptions; /* Match options */
|
||||
int poptions; /* Pattern options */
|
||||
int nllen; /* 1 or 2 for newline string length */
|
||||
uschar nl[4]; /* Newline string */
|
||||
int nltype; /* Newline type */
|
||||
int nllen; /* Newline string length */
|
||||
uschar nl[4]; /* Newline string when fixed */
|
||||
void *callout_data; /* To pass back to callouts */
|
||||
} dfa_match_data;
|
||||
|
||||
@@ -941,13 +1030,17 @@ extern const uschar _pcre_OP_lengths[];
|
||||
one of the exported public functions. They have to be "external" in the C
|
||||
sense, but are not part of the PCRE public API. */
|
||||
|
||||
extern int _pcre_ord2utf8(int, uschar *);
|
||||
extern real_pcre * _pcre_try_flipped(const real_pcre *, real_pcre *,
|
||||
const pcre_study_data *, pcre_study_data *);
|
||||
extern int _pcre_ucp_findprop(const unsigned int, int *, int *);
|
||||
extern int _pcre_ucp_othercase(const int);
|
||||
extern int _pcre_valid_utf8(const uschar *, int);
|
||||
extern BOOL _pcre_xclass(int, const uschar *);
|
||||
extern BOOL _pcre_is_newline(const uschar *, const uschar *, int *,
|
||||
BOOL);
|
||||
extern int _pcre_ord2utf8(int, uschar *);
|
||||
extern real_pcre *_pcre_try_flipped(const real_pcre *, real_pcre *,
|
||||
const pcre_study_data *, pcre_study_data *);
|
||||
extern int _pcre_ucp_findprop(const unsigned int, int *, int *);
|
||||
extern unsigned int _pcre_ucp_othercase(const unsigned int);
|
||||
extern int _pcre_valid_utf8(const uschar *, int);
|
||||
extern BOOL _pcre_was_newline(const uschar *, const uschar *, int *,
|
||||
BOOL);
|
||||
extern BOOL _pcre_xclass(int, const uschar *);
|
||||
|
||||
#endif
|
||||
|
||||
|
||||
@@ -6,7 +6,7 @@
|
||||
and semantics are as close as possible to those of the Perl 5 language.
|
||||
|
||||
Written by Philip Hazel
|
||||
Copyright (c) 1997-2007 University of Cambridge
|
||||
Copyright (c) 1997-2006 University of Cambridge
|
||||
|
||||
-----------------------------------------------------------------------------
|
||||
Redistribution and use in source and binary forms, with or without
|
||||
@@ -130,7 +130,7 @@ for (i = 0; i < 256; i++)
|
||||
meta-character, which in this sense is any character that terminates a run
|
||||
of data characters. */
|
||||
|
||||
if (strchr("*+?{^.$|()[", i) != 0) x += ctype_meta;
|
||||
if (strchr("\\*+?{^.$|()[", i) != 0) x += ctype_meta;
|
||||
*p++ = x;
|
||||
}
|
||||
|
||||
|
||||
135
ext/pcre/pcrelib/pcre_newline.c
Normal file
135
ext/pcre/pcrelib/pcre_newline.c
Normal file
@@ -0,0 +1,135 @@
|
||||
/*************************************************
|
||||
* Perl-Compatible Regular Expressions *
|
||||
*************************************************/
|
||||
|
||||
/* PCRE is a library of functions to support regular expressions whose syntax
|
||||
and semantics are as close as possible to those of the Perl 5 language.
|
||||
|
||||
Written by Philip Hazel
|
||||
Copyright (c) 1997-2006 University of Cambridge
|
||||
|
||||
-----------------------------------------------------------------------------
|
||||
Redistribution and use in source and binary forms, with or without
|
||||
modification, are permitted provided that the following conditions are met:
|
||||
|
||||
* Redistributions of source code must retain the above copyright notice,
|
||||
this list of conditions and the following disclaimer.
|
||||
|
||||
* Redistributions in binary form must reproduce the above copyright
|
||||
notice, this list of conditions and the following disclaimer in the
|
||||
documentation and/or other materials provided with the distribution.
|
||||
|
||||
* Neither the name of the University of Cambridge nor the names of its
|
||||
contributors may be used to endorse or promote products derived from
|
||||
this software without specific prior written permission.
|
||||
|
||||
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
|
||||
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
|
||||
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
|
||||
ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
|
||||
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
|
||||
CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
|
||||
SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
|
||||
INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
|
||||
CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
|
||||
ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
|
||||
POSSIBILITY OF SUCH DAMAGE.
|
||||
-----------------------------------------------------------------------------
|
||||
*/
|
||||
|
||||
|
||||
/* This module contains internal functions for testing newlines when more than
|
||||
one kind of newline is to be recognized. When a newline is found, its length is
|
||||
returned. In principle, we could implement several newline "types", each
|
||||
referring to a different set of newline characters. At present, PCRE supports
|
||||
only NLTYPE_FIXED, which gets handled without these functions, and NLTYPE_ALL,
|
||||
so for now the type isn't passed into the functions. It can easily be added
|
||||
later if required. The full list of Unicode newline characters is taken from
|
||||
http://unicode.org/unicode/reports/tr18/. */
|
||||
|
||||
|
||||
#include "pcre_internal.h"
|
||||
|
||||
|
||||
|
||||
/*************************************************
|
||||
* Check for newline at given position *
|
||||
*************************************************/
|
||||
|
||||
/* It is guaranteed that the initial value of ptr is less than the end of the
|
||||
string that is being processed.
|
||||
|
||||
Arguments:
|
||||
ptr pointer to possible newline
|
||||
endptr pointer to the end of the string
|
||||
lenptr where to return the length
|
||||
utf8 TRUE if in utf8 mode
|
||||
|
||||
Returns: TRUE or FALSE
|
||||
*/
|
||||
|
||||
BOOL
|
||||
_pcre_is_newline(const uschar *ptr, const uschar *endptr, int *lenptr,
|
||||
BOOL utf8)
|
||||
{
|
||||
int c;
|
||||
if (utf8) { GETCHAR(c, ptr); } else c = *ptr;
|
||||
switch(c)
|
||||
{
|
||||
case 0x000a: /* LF */
|
||||
case 0x000b: /* VT */
|
||||
case 0x000c: *lenptr = 1; return TRUE; /* FF */
|
||||
case 0x000d: *lenptr = (ptr < endptr - 1 && ptr[1] == 0x0a)? 2 : 1;
|
||||
return TRUE; /* CR */
|
||||
case 0x0085: *lenptr = utf8? 2 : 1; return TRUE; /* NEL */
|
||||
case 0x2028: /* LS */
|
||||
case 0x2029: *lenptr = 3; return TRUE; /* PS */
|
||||
default: return FALSE;
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
|
||||
/*************************************************
|
||||
* Check for newline at previous position *
|
||||
*************************************************/
|
||||
|
||||
/* It is guaranteed that the initial value of ptr is greater than the start of
|
||||
the string that is being processed.
|
||||
|
||||
Arguments:
|
||||
ptr pointer to possible newline
|
||||
startptr pointer to the start of the string
|
||||
lenptr where to return the length
|
||||
utf8 TRUE if in utf8 mode
|
||||
|
||||
Returns: TRUE or FALSE
|
||||
*/
|
||||
|
||||
BOOL
|
||||
_pcre_was_newline(const uschar *ptr, const uschar *startptr, int *lenptr,
|
||||
BOOL utf8)
|
||||
{
|
||||
int c;
|
||||
ptr--;
|
||||
if (utf8)
|
||||
{
|
||||
BACKCHAR(ptr);
|
||||
GETCHAR(c, ptr);
|
||||
}
|
||||
else c = *ptr;
|
||||
switch(c)
|
||||
{
|
||||
case 0x000a: *lenptr = (ptr > startptr && ptr[-1] == 0x0d)? 2 : 1;
|
||||
return TRUE; /* LF */
|
||||
case 0x000b: /* VT */
|
||||
case 0x000c: /* FF */
|
||||
case 0x000d: *lenptr = 1; return TRUE; /* CR */
|
||||
case 0x0085: *lenptr = utf8? 2 : 1; return TRUE; /* NEL */
|
||||
case 0x2028: /* LS */
|
||||
case 0x2029: *lenptr = 3; return TRUE; /* PS */
|
||||
default: return FALSE;
|
||||
}
|
||||
}
|
||||
|
||||
/* End of pcre_newline.c */
|
||||
@@ -6,7 +6,7 @@
|
||||
and semantics are as close as possible to those of the Perl 5 language.
|
||||
|
||||
Written by Philip Hazel
|
||||
Copyright (c) 1997-2007 University of Cambridge
|
||||
Copyright (c) 1997-2006 University of Cambridge
|
||||
|
||||
-----------------------------------------------------------------------------
|
||||
Redistribution and use in source and binary forms, with or without
|
||||
|
||||
@@ -49,9 +49,19 @@ local functions. This source file is used in two places:
|
||||
compiled regex for debugging purposes. */
|
||||
|
||||
|
||||
/* Macro that decides whether a character should be output as a literal or in
|
||||
hexadecimal. We don't use isprint() because that can vary from system to system
|
||||
(even without the use of locales) and we want the output always to be the same,
|
||||
for testing purposes. This macro is used in pcretest as well as in this file. */
|
||||
|
||||
#define PRINTABLE(c) ((c) >= 32 && (c) < 127)
|
||||
|
||||
/* The table of operator names. */
|
||||
|
||||
static const char *OP_names[] = { OP_NAME_LIST };
|
||||
|
||||
|
||||
|
||||
/*************************************************
|
||||
* Print single- or multi-byte character *
|
||||
*************************************************/
|
||||
@@ -63,7 +73,7 @@ int c = *ptr;
|
||||
|
||||
if (!utf8 || (c & 0xc0) != 0xc0)
|
||||
{
|
||||
if (isprint(c)) fprintf(f, "%c", c); else fprintf(f, "\\x%02x", c);
|
||||
if (PRINTABLE(c)) fprintf(f, "%c", c); else fprintf(f, "\\x%02x", c);
|
||||
return 0;
|
||||
}
|
||||
else
|
||||
@@ -160,16 +170,6 @@ for(;;)
|
||||
|
||||
fprintf(f, "%3d ", (int)(code - codestart));
|
||||
|
||||
if (*code >= OP_BRA)
|
||||
{
|
||||
if (*code - OP_BRA > EXTRACT_BASIC_MAX)
|
||||
fprintf(f, "%3d Bra extra\n", GET(code, 1));
|
||||
else
|
||||
fprintf(f, "%3d Bra %d\n", GET(code, 1), *code - OP_BRA);
|
||||
code += _pcre_OP_lengths[OP_BRA];
|
||||
continue;
|
||||
}
|
||||
|
||||
switch(*code)
|
||||
{
|
||||
case OP_END:
|
||||
@@ -203,6 +203,14 @@ for(;;)
|
||||
fprintf(f, "\n");
|
||||
continue;
|
||||
|
||||
case OP_CBRA:
|
||||
case OP_SCBRA:
|
||||
fprintf(f, "%3d %s %d", GET(code, 1), OP_names[*code],
|
||||
GET2(code, 1+LINK_SIZE));
|
||||
break;
|
||||
|
||||
case OP_BRA:
|
||||
case OP_SBRA:
|
||||
case OP_KETRMAX:
|
||||
case OP_KETRMIN:
|
||||
case OP_ALT:
|
||||
@@ -213,33 +221,45 @@ for(;;)
|
||||
case OP_ASSERTBACK_NOT:
|
||||
case OP_ONCE:
|
||||
case OP_COND:
|
||||
case OP_SCOND:
|
||||
case OP_REVERSE:
|
||||
fprintf(f, "%3d %s", GET(code, 1), OP_names[*code]);
|
||||
break;
|
||||
|
||||
case OP_BRANUMBER:
|
||||
printf("%3d %s", GET2(code, 1), OP_names[*code]);
|
||||
case OP_CREF:
|
||||
fprintf(f, "%3d %s", GET2(code,1), OP_names[*code]);
|
||||
break;
|
||||
|
||||
case OP_CREF:
|
||||
if (GET2(code, 1) == CREF_RECURSE)
|
||||
fprintf(f, " Cond recurse");
|
||||
case OP_RREF:
|
||||
c = GET2(code, 1);
|
||||
if (c == RREF_ANY)
|
||||
fprintf(f, " Cond recurse any");
|
||||
else
|
||||
fprintf(f, "%3d %s", GET2(code,1), OP_names[*code]);
|
||||
fprintf(f, " Cond recurse %d", c);
|
||||
break;
|
||||
|
||||
case OP_DEF:
|
||||
fprintf(f, " Cond def");
|
||||
break;
|
||||
|
||||
case OP_STAR:
|
||||
case OP_MINSTAR:
|
||||
case OP_POSSTAR:
|
||||
case OP_PLUS:
|
||||
case OP_MINPLUS:
|
||||
case OP_POSPLUS:
|
||||
case OP_QUERY:
|
||||
case OP_MINQUERY:
|
||||
case OP_POSQUERY:
|
||||
case OP_TYPESTAR:
|
||||
case OP_TYPEMINSTAR:
|
||||
case OP_TYPEPOSSTAR:
|
||||
case OP_TYPEPLUS:
|
||||
case OP_TYPEMINPLUS:
|
||||
case OP_TYPEPOSPLUS:
|
||||
case OP_TYPEQUERY:
|
||||
case OP_TYPEMINQUERY:
|
||||
case OP_TYPEPOSQUERY:
|
||||
fprintf(f, " ");
|
||||
if (*code >= OP_TYPESTAR)
|
||||
{
|
||||
@@ -257,17 +277,20 @@ for(;;)
|
||||
case OP_EXACT:
|
||||
case OP_UPTO:
|
||||
case OP_MINUPTO:
|
||||
case OP_POSUPTO:
|
||||
fprintf(f, " ");
|
||||
extra = print_char(f, code+3, utf8);
|
||||
fprintf(f, "{");
|
||||
if (*code != OP_EXACT) fprintf(f, ",");
|
||||
if (*code != OP_EXACT) fprintf(f, "0,");
|
||||
fprintf(f, "%d}", GET2(code,1));
|
||||
if (*code == OP_MINUPTO) fprintf(f, "?");
|
||||
else if (*code == OP_POSUPTO) fprintf(f, "+");
|
||||
break;
|
||||
|
||||
case OP_TYPEEXACT:
|
||||
case OP_TYPEUPTO:
|
||||
case OP_TYPEMINUPTO:
|
||||
case OP_TYPEPOSUPTO:
|
||||
fprintf(f, " %s", OP_names[code[3]]);
|
||||
if (code[3] == OP_PROP || code[3] == OP_NOTPROP)
|
||||
{
|
||||
@@ -278,20 +301,26 @@ for(;;)
|
||||
if (*code != OP_TYPEEXACT) fprintf(f, "0,");
|
||||
fprintf(f, "%d}", GET2(code,1));
|
||||
if (*code == OP_TYPEMINUPTO) fprintf(f, "?");
|
||||
else if (*code == OP_TYPEPOSUPTO) fprintf(f, "+");
|
||||
break;
|
||||
|
||||
case OP_NOT:
|
||||
if (isprint(c = code[1])) fprintf(f, " [^%c]", c);
|
||||
c = code[1];
|
||||
if (PRINTABLE(c)) fprintf(f, " [^%c]", c);
|
||||
else fprintf(f, " [^\\x%02x]", c);
|
||||
break;
|
||||
|
||||
case OP_NOTSTAR:
|
||||
case OP_NOTMINSTAR:
|
||||
case OP_NOTPOSSTAR:
|
||||
case OP_NOTPLUS:
|
||||
case OP_NOTMINPLUS:
|
||||
case OP_NOTPOSPLUS:
|
||||
case OP_NOTQUERY:
|
||||
case OP_NOTMINQUERY:
|
||||
if (isprint(c = code[1])) fprintf(f, " [^%c]", c);
|
||||
case OP_NOTPOSQUERY:
|
||||
c = code[1];
|
||||
if (PRINTABLE(c)) fprintf(f, " [^%c]", c);
|
||||
else fprintf(f, " [^\\x%02x]", c);
|
||||
fprintf(f, "%s", OP_names[*code]);
|
||||
break;
|
||||
@@ -299,11 +328,14 @@ for(;;)
|
||||
case OP_NOTEXACT:
|
||||
case OP_NOTUPTO:
|
||||
case OP_NOTMINUPTO:
|
||||
if (isprint(c = code[3])) fprintf(f, " [^%c]{", c);
|
||||
case OP_NOTPOSUPTO:
|
||||
c = code[3];
|
||||
if (PRINTABLE(c)) fprintf(f, " [^%c]{", c);
|
||||
else fprintf(f, " [^\\x%02x]{", c);
|
||||
if (*code != OP_NOTEXACT) fprintf(f, "0,");
|
||||
fprintf(f, "%d}", GET2(code,1));
|
||||
if (*code == OP_NOTMINUPTO) fprintf(f, "?");
|
||||
else if (*code == OP_NOTPOSUPTO) fprintf(f, "+");
|
||||
break;
|
||||
|
||||
case OP_RECURSE:
|
||||
@@ -363,12 +395,14 @@ for(;;)
|
||||
for (j = i+1; j < 256; j++)
|
||||
if ((ccode[j/8] & (1 << (j&7))) == 0) break;
|
||||
if (i == '-' || i == ']') fprintf(f, "\\");
|
||||
if (isprint(i)) fprintf(f, "%c", i); else fprintf(f, "\\x%02x", i);
|
||||
if (PRINTABLE(i)) fprintf(f, "%c", i);
|
||||
else fprintf(f, "\\x%02x", i);
|
||||
if (--j > i)
|
||||
{
|
||||
if (j != i + 1) fprintf(f, "-");
|
||||
if (j == '-' || j == ']') fprintf(f, "\\");
|
||||
if (isprint(j)) fprintf(f, "%c", j); else fprintf(f, "\\x%02x", j);
|
||||
if (PRINTABLE(j)) fprintf(f, "%c", j);
|
||||
else fprintf(f, "\\x%02x", j);
|
||||
}
|
||||
i = j;
|
||||
}
|
||||
|
||||
@@ -6,7 +6,7 @@
|
||||
and semantics are as close as possible to those of the Perl 5 language.
|
||||
|
||||
Written by Philip Hazel
|
||||
Copyright (c) 1997-2007 University of Cambridge
|
||||
Copyright (c) 1997-2006 University of Cambridge
|
||||
|
||||
-----------------------------------------------------------------------------
|
||||
Redistribution and use in source and binary forms, with or without
|
||||
|
||||
@@ -43,6 +43,7 @@ Scanner::Scanner()
|
||||
input_(data_),
|
||||
skip_(NULL),
|
||||
should_skip_(false),
|
||||
skip_repeat_(false),
|
||||
save_comments_(false),
|
||||
comments_(NULL),
|
||||
comments_offset_(0) {
|
||||
@@ -53,6 +54,7 @@ Scanner::Scanner(const string& in)
|
||||
input_(data_),
|
||||
skip_(NULL),
|
||||
should_skip_(false),
|
||||
skip_repeat_(false),
|
||||
save_comments_(false),
|
||||
comments_(NULL),
|
||||
comments_offset_(0) {
|
||||
@@ -63,15 +65,31 @@ Scanner::~Scanner() {
|
||||
delete comments_;
|
||||
}
|
||||
|
||||
void Scanner::SetSkipExpression(const char* re) {
|
||||
delete skip_;
|
||||
if (re != NULL) {
|
||||
skip_ = new RE(re);
|
||||
should_skip_ = true;
|
||||
skip_repeat_ = true;
|
||||
ConsumeSkip();
|
||||
} else {
|
||||
skip_ = NULL;
|
||||
should_skip_ = false;
|
||||
skip_repeat_ = false;
|
||||
}
|
||||
}
|
||||
|
||||
void Scanner::Skip(const char* re) {
|
||||
delete skip_;
|
||||
if (re != NULL) {
|
||||
skip_ = new RE(re);
|
||||
should_skip_ = true;
|
||||
skip_repeat_ = false;
|
||||
ConsumeSkip();
|
||||
} else {
|
||||
skip_ = NULL;
|
||||
should_skip_ = false;
|
||||
skip_repeat_ = false;
|
||||
}
|
||||
}
|
||||
|
||||
@@ -118,19 +136,22 @@ bool Scanner::Consume(const RE& re,
|
||||
|
||||
// helper function to consume *skip_ and honour save_comments_
|
||||
void Scanner::ConsumeSkip() {
|
||||
const char* start_data = input_.data();
|
||||
while (skip_->Consume(&input_)) {
|
||||
if (!skip_repeat_) {
|
||||
// Only one skip allowed.
|
||||
break;
|
||||
}
|
||||
}
|
||||
if (save_comments_) {
|
||||
if (NULL == comments_) {
|
||||
if (comments_ == NULL) {
|
||||
comments_ = new vector<StringPiece>;
|
||||
}
|
||||
const char *start_data = input_.data();
|
||||
skip_->Consume(&input_);
|
||||
// already pointing one past end, so no need to +1
|
||||
int length = input_.data() - start_data;
|
||||
if (length > 0) {
|
||||
comments_->push_back(StringPiece(start_data, length));
|
||||
}
|
||||
} else {
|
||||
skip_->Consume(&input_);
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
@@ -36,7 +36,7 @@
|
||||
// Scanner scanner(input);
|
||||
// string var;
|
||||
// int number;
|
||||
// scanner.Skip("\\s+"); // Skip any white space we encounter
|
||||
// scanner.SetSkipExpression("\\s+"); // Skip any white space we encounter
|
||||
// while (scanner.Consume("(\\w+) = (\\d+)", &var, &number)) {
|
||||
// ...;
|
||||
// }
|
||||
@@ -90,10 +90,16 @@ class Scanner {
|
||||
// skipped. For example, a programming language scanner would use
|
||||
// a skip RE that matches white space and comments.
|
||||
//
|
||||
// scanner.Skip("(\\s|//.*|/[*](.|\n)*?[*]/)*");
|
||||
// scanner.SetSkipExpression("\\s+|//.*|/[*](.|\n)*?[*]/");
|
||||
//
|
||||
// Skipping repeats as long as it succeeds. We used to let people do
|
||||
// this by writing "(...)*" in the regular expression, but that added
|
||||
// up to lots of recursive calls within the pcre library, so now we
|
||||
// control repetition explicitly via the function call API.
|
||||
//
|
||||
// You can pass NULL for "re" if you do not want any data to be skipped.
|
||||
void Skip(const char* re);
|
||||
void Skip(const char* re); // DEPRECATED; does *not* repeat
|
||||
void SetSkipExpression(const char* re);
|
||||
|
||||
// Temporarily pause "skip"ing. This
|
||||
// Skip("Foo"); code ; DisableSkip(); code; EnableSkip()
|
||||
@@ -109,12 +115,13 @@ class Scanner {
|
||||
/***** Special wrappers around SetSkipExpression() for some common idioms *****/
|
||||
|
||||
// Arranges to skip whitespace, C comments, C++ comments.
|
||||
// The overall RE is a repeated disjunction of the following REs:
|
||||
// The overall RE is a disjunction of the following REs:
|
||||
// \\s whitespace
|
||||
// //.*\n C++ comment
|
||||
// /[*](.|\n)*?[*]/ C comment (x*? means minimal repetitions of x)
|
||||
// We get repetition via the semantics of SetSkipExpression, not by using *
|
||||
void SkipCXXComments() {
|
||||
Skip("((\\s|//.*\n|/[*](.|\n)*?[*]/)*)");
|
||||
SetSkipExpression("\\s|//.*\n|/[*](?:\n|.)*?[*]/");
|
||||
}
|
||||
|
||||
void set_save_comments(bool comments) {
|
||||
@@ -143,6 +150,7 @@ class Scanner {
|
||||
StringPiece input_; // Unprocessed input
|
||||
RE* skip_; // If non-NULL, RE for skipping input
|
||||
bool should_skip_; // If true, use skip_
|
||||
bool skip_repeat_; // If true, repeat skip_ as long as it works
|
||||
bool save_comments_; // If true, aggregate the skip expression
|
||||
|
||||
// the skipped comments
|
||||
|
||||
@@ -33,10 +33,13 @@
|
||||
// functionality.
|
||||
|
||||
#include <stdio.h>
|
||||
#include <string>
|
||||
#include <vector>
|
||||
#include <pcre_stringpiece.h>
|
||||
#include <pcre_scanner.h>
|
||||
|
||||
#define FLAGS_unittest_stack_size 49152
|
||||
|
||||
// Dies with a fatal error if the two values are not equal.
|
||||
#define CHECK_EQ(a, b) do { \
|
||||
if ( (a) != (b) ) { \
|
||||
@@ -116,8 +119,31 @@ static void TestScanner() {
|
||||
comments.resize(0);
|
||||
}
|
||||
|
||||
static void TestBigComment() {
|
||||
string input;
|
||||
for (int i = 0; i < 1024; ++i) {
|
||||
char buf[1024];
|
||||
snprintf(buf, sizeof(buf), " # Comment %d\n", i);
|
||||
input += buf;
|
||||
}
|
||||
input += "name = value;\n";
|
||||
|
||||
Scanner s(input.c_str());
|
||||
s.SetSkipExpression("\\s+|#.*\n");
|
||||
|
||||
string name;
|
||||
string value;
|
||||
s.Consume("(\\w+) = (\\w+);", &name, &value);
|
||||
CHECK_EQ(name, "name");
|
||||
CHECK_EQ(value, "value");
|
||||
}
|
||||
|
||||
// TODO: also test scanner and big-comment in a thread with a
|
||||
// small stack size
|
||||
|
||||
int main(int argc, char** argv) {
|
||||
TestScanner();
|
||||
TestBigComment();
|
||||
|
||||
// Done
|
||||
printf("OK\n");
|
||||
|
||||
@@ -6,7 +6,7 @@
|
||||
and semantics are as close as possible to those of the Perl 5 language.
|
||||
|
||||
Written by Philip Hazel
|
||||
Copyright (c) 1997-2007 University of Cambridge
|
||||
Copyright (c) 1997-2006 University of Cambridge
|
||||
|
||||
-----------------------------------------------------------------------------
|
||||
Redistribution and use in source and binary forms, with or without
|
||||
@@ -45,6 +45,11 @@ supporting functions. */
|
||||
#include "pcre_internal.h"
|
||||
|
||||
|
||||
/* Returns from set_start_bits() */
|
||||
|
||||
enum { SSB_FAIL, SSB_DONE, SSB_CONTINUE };
|
||||
|
||||
|
||||
/*************************************************
|
||||
* Set a bit and maybe its alternate case *
|
||||
*************************************************/
|
||||
@@ -72,12 +77,16 @@ if (caseless && (cd->ctypes[c] & ctype_letter) != 0)
|
||||
|
||||
|
||||
/*************************************************
|
||||
* Create bitmap of starting chars *
|
||||
* Create bitmap of starting bytes *
|
||||
*************************************************/
|
||||
|
||||
/* This function scans a compiled unanchored expression and attempts to build a
|
||||
bitmap of the set of initial characters. If it can't, it returns FALSE. As time
|
||||
goes by, we may be able to get more clever at doing this.
|
||||
/* This function scans a compiled unanchored expression recursively and
|
||||
attempts to build a bitmap of the set of possible starting bytes. As time goes
|
||||
by, we may be able to get more clever at doing this. The SSB_CONTINUE return is
|
||||
useful for parenthesized groups in patterns such as (a*)b where the group
|
||||
provides some optional starting bytes but scanning must continue at the outer
|
||||
level to find at least one mandatory byte. At the outermost level, this
|
||||
function fails unless the result is SSB_DONE.
|
||||
|
||||
Arguments:
|
||||
code points to an expression
|
||||
@@ -86,14 +95,17 @@ Arguments:
|
||||
utf8 TRUE if in UTF-8 mode
|
||||
cd the block with char table pointers
|
||||
|
||||
Returns: TRUE if table built, FALSE otherwise
|
||||
Returns: SSB_FAIL => Failed to find any starting bytes
|
||||
SSB_DONE => Found mandatory starting bytes
|
||||
SSB_CONTINUE => Found optional starting bytes
|
||||
*/
|
||||
|
||||
static BOOL
|
||||
static int
|
||||
set_start_bits(const uschar *code, uschar *start_bits, BOOL caseless,
|
||||
BOOL utf8, compile_data *cd)
|
||||
{
|
||||
register int c;
|
||||
int yield = SSB_DONE;
|
||||
|
||||
#if 0
|
||||
/* ========================================================================= */
|
||||
@@ -114,25 +126,55 @@ volatile int dummy;
|
||||
|
||||
do
|
||||
{
|
||||
const uschar *tcode = code + 1 + LINK_SIZE;
|
||||
const uschar *tcode = code + (((int)*code == OP_CBRA)? 3:1) + LINK_SIZE;
|
||||
BOOL try_next = TRUE;
|
||||
|
||||
while (try_next)
|
||||
while (try_next) /* Loop for items in this branch */
|
||||
{
|
||||
/* If a branch starts with a bracket or a positive lookahead assertion,
|
||||
recurse to set bits from within them. That's all for this branch. */
|
||||
|
||||
if ((int)*tcode >= OP_BRA || *tcode == OP_ASSERT)
|
||||
int rc;
|
||||
switch(*tcode)
|
||||
{
|
||||
if (!set_start_bits(tcode, start_bits, caseless, utf8, cd))
|
||||
return FALSE;
|
||||
try_next = FALSE;
|
||||
}
|
||||
/* Fail if we reach something we don't understand */
|
||||
|
||||
else switch(*tcode)
|
||||
{
|
||||
default:
|
||||
return FALSE;
|
||||
return SSB_FAIL;
|
||||
|
||||
/* If we hit a bracket or a positive lookahead assertion, recurse to set
|
||||
bits from within the subpattern. If it can't find anything, we have to
|
||||
give up. If it finds some mandatory character(s), we are done for this
|
||||
branch. Otherwise, carry on scanning after the subpattern. */
|
||||
|
||||
case OP_BRA:
|
||||
case OP_SBRA:
|
||||
case OP_CBRA:
|
||||
case OP_SCBRA:
|
||||
case OP_ONCE:
|
||||
case OP_ASSERT:
|
||||
rc = set_start_bits(tcode, start_bits, caseless, utf8, cd);
|
||||
if (rc == SSB_FAIL) return SSB_FAIL;
|
||||
if (rc == SSB_DONE) try_next = FALSE; else
|
||||
{
|
||||
do tcode += GET(tcode, 1); while (*tcode == OP_ALT);
|
||||
tcode += 1 + LINK_SIZE;
|
||||
}
|
||||
break;
|
||||
|
||||
/* If we hit ALT or KET, it means we haven't found anything mandatory in
|
||||
this branch, though we might have found something optional. For ALT, we
|
||||
continue with the next alternative, but we have to arrange that the final
|
||||
result from subpattern is SSB_CONTINUE rather than SSB_DONE. For KET,
|
||||
return SSB_CONTINUE: if this is the top level, that indicates failure,
|
||||
but after a nested subpattern, it causes scanning to continue. */
|
||||
|
||||
case OP_ALT:
|
||||
yield = SSB_CONTINUE;
|
||||
try_next = FALSE;
|
||||
break;
|
||||
|
||||
case OP_KET:
|
||||
case OP_KETRMAX:
|
||||
case OP_KETRMIN:
|
||||
return SSB_CONTINUE;
|
||||
|
||||
/* Skip over callout */
|
||||
|
||||
@@ -140,19 +182,13 @@ do
|
||||
tcode += 2 + 2*LINK_SIZE;
|
||||
break;
|
||||
|
||||
/* Skip over extended extraction bracket number */
|
||||
|
||||
case OP_BRANUMBER:
|
||||
tcode += 3;
|
||||
break;
|
||||
|
||||
/* Skip over lookbehind and negative lookahead assertions */
|
||||
|
||||
case OP_ASSERT_NOT:
|
||||
case OP_ASSERTBACK:
|
||||
case OP_ASSERTBACK_NOT:
|
||||
do tcode += GET(tcode, 1); while (*tcode == OP_ALT);
|
||||
tcode += 1+LINK_SIZE;
|
||||
tcode += 1 + LINK_SIZE;
|
||||
break;
|
||||
|
||||
/* Skip over an option setting, changing the caseless flag */
|
||||
@@ -166,27 +202,30 @@ do
|
||||
|
||||
case OP_BRAZERO:
|
||||
case OP_BRAMINZERO:
|
||||
if (!set_start_bits(++tcode, start_bits, caseless, utf8, cd))
|
||||
return FALSE;
|
||||
if (set_start_bits(++tcode, start_bits, caseless, utf8, cd) == SSB_FAIL)
|
||||
return SSB_FAIL;
|
||||
/* =========================================================================
|
||||
See the comment at the head of this function concerning the next line,
|
||||
which was an old fudge for the benefit of OS/2.
|
||||
dummy = 1;
|
||||
========================================================================= */
|
||||
do tcode += GET(tcode,1); while (*tcode == OP_ALT);
|
||||
tcode += 1+LINK_SIZE;
|
||||
tcode += 1 + LINK_SIZE;
|
||||
break;
|
||||
|
||||
/* Single-char * or ? sets the bit and tries the next item */
|
||||
|
||||
case OP_STAR:
|
||||
case OP_MINSTAR:
|
||||
case OP_POSSTAR:
|
||||
case OP_QUERY:
|
||||
case OP_MINQUERY:
|
||||
case OP_POSQUERY:
|
||||
set_bit(start_bits, tcode[1], caseless, cd);
|
||||
tcode += 2;
|
||||
#ifdef SUPPORT_UTF8
|
||||
if (utf8) while ((*tcode & 0xc0) == 0x80) tcode++;
|
||||
if (utf8 && tcode[-1] >= 0xc0)
|
||||
tcode += _pcre_utf8_table4[tcode[-1] & 0x3f];
|
||||
#endif
|
||||
break;
|
||||
|
||||
@@ -194,10 +233,12 @@ do
|
||||
|
||||
case OP_UPTO:
|
||||
case OP_MINUPTO:
|
||||
case OP_POSUPTO:
|
||||
set_bit(start_bits, tcode[3], caseless, cd);
|
||||
tcode += 4;
|
||||
#ifdef SUPPORT_UTF8
|
||||
if (utf8) while ((*tcode & 0xc0) == 0x80) tcode++;
|
||||
if (utf8 && tcode[-1] >= 0xc0)
|
||||
tcode += _pcre_utf8_table4[tcode[-1] & 0x3f];
|
||||
#endif
|
||||
break;
|
||||
|
||||
@@ -210,6 +251,7 @@ do
|
||||
case OP_CHARNC:
|
||||
case OP_PLUS:
|
||||
case OP_MINPLUS:
|
||||
case OP_POSPLUS:
|
||||
set_bit(start_bits, tcode[1], caseless, cd);
|
||||
try_next = FALSE;
|
||||
break;
|
||||
@@ -283,16 +325,19 @@ do
|
||||
|
||||
case OP_TYPEUPTO:
|
||||
case OP_TYPEMINUPTO:
|
||||
case OP_TYPEPOSUPTO:
|
||||
tcode += 2; /* Fall through */
|
||||
|
||||
case OP_TYPESTAR:
|
||||
case OP_TYPEMINSTAR:
|
||||
case OP_TYPEPOSSTAR:
|
||||
case OP_TYPEQUERY:
|
||||
case OP_TYPEMINQUERY:
|
||||
case OP_TYPEPOSQUERY:
|
||||
switch(tcode[1])
|
||||
{
|
||||
case OP_ANY:
|
||||
return FALSE;
|
||||
return SSB_FAIL;
|
||||
|
||||
case OP_NOT_DIGIT:
|
||||
for (c = 0; c < 32; c++)
|
||||
@@ -418,7 +463,7 @@ do
|
||||
code += GET(code, 1); /* Advance to next branch */
|
||||
}
|
||||
while (*code == OP_ALT);
|
||||
return TRUE;
|
||||
return yield;
|
||||
}
|
||||
|
||||
|
||||
@@ -492,8 +537,8 @@ compile_block.ctypes = tables + ctypes_offset;
|
||||
/* See if we can find a fixed set of initial characters for the pattern. */
|
||||
|
||||
memset(start_bits, 0, 32 * sizeof(uschar));
|
||||
if (!set_start_bits(code, start_bits, (re->options & PCRE_CASELESS) != 0,
|
||||
(re->options & PCRE_UTF8) != 0, &compile_block)) return NULL;
|
||||
if (set_start_bits(code, start_bits, (re->options & PCRE_CASELESS) != 0,
|
||||
(re->options & PCRE_UTF8) != 0, &compile_block) != SSB_DONE) return NULL;
|
||||
|
||||
/* Get a pcre_extra block and a pcre_study_data block. The study data is put in
|
||||
the latter, which is pointed to by the former, which may also get additional
|
||||
|
||||
@@ -6,7 +6,7 @@
|
||||
and semantics are as close as possible to those of the Perl 5 language.
|
||||
|
||||
Written by Philip Hazel
|
||||
Copyright (c) 1997-2007 University of Cambridge
|
||||
Copyright (c) 1997-2006 University of Cambridge
|
||||
|
||||
-----------------------------------------------------------------------------
|
||||
Redistribution and use in source and binary forms, with or without
|
||||
@@ -72,9 +72,8 @@ first byte of a character, indexed by the number of additional bytes. */
|
||||
const int _pcre_utf8_table2[] = { 0, 0xc0, 0xe0, 0xf0, 0xf8, 0xfc};
|
||||
const int _pcre_utf8_table3[] = { 0xff, 0x1f, 0x0f, 0x07, 0x03, 0x01};
|
||||
|
||||
/* Table of the number of extra characters, indexed by the first character
|
||||
masked with 0x3f. The highest number for a valid UTF-8 character is in fact
|
||||
0x3d. */
|
||||
/* Table of the number of extra bytes, indexed by the first byte masked with
|
||||
0x3f. The highest number for a valid UTF-8 first byte is in fact 0x3d. */
|
||||
|
||||
const uschar _pcre_utf8_table4[] = {
|
||||
1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
|
||||
@@ -89,6 +88,7 @@ const ucp_type_table _pcre_utt[] = {
|
||||
{ "Any", PT_ANY, 0 },
|
||||
{ "Arabic", PT_SC, ucp_Arabic },
|
||||
{ "Armenian", PT_SC, ucp_Armenian },
|
||||
{ "Balinese", PT_SC, ucp_Balinese },
|
||||
{ "Bengali", PT_SC, ucp_Bengali },
|
||||
{ "Bopomofo", PT_SC, ucp_Bopomofo },
|
||||
{ "Braille", PT_SC, ucp_Braille },
|
||||
@@ -104,6 +104,7 @@ const ucp_type_table _pcre_utt[] = {
|
||||
{ "Common", PT_SC, ucp_Common },
|
||||
{ "Coptic", PT_SC, ucp_Coptic },
|
||||
{ "Cs", PT_PC, ucp_Cs },
|
||||
{ "Cuneiform", PT_SC, ucp_Cuneiform },
|
||||
{ "Cypriot", PT_SC, ucp_Cypriot },
|
||||
{ "Cyrillic", PT_SC, ucp_Cyrillic },
|
||||
{ "Deseret", PT_SC, ucp_Deseret },
|
||||
@@ -146,6 +147,7 @@ const ucp_type_table _pcre_utt[] = {
|
||||
{ "N", PT_GC, ucp_N },
|
||||
{ "Nd", PT_PC, ucp_Nd },
|
||||
{ "New_Tai_Lue", PT_SC, ucp_New_Tai_Lue },
|
||||
{ "Nko", PT_SC, ucp_Nko },
|
||||
{ "Nl", PT_PC, ucp_Nl },
|
||||
{ "No", PT_PC, ucp_No },
|
||||
{ "Ogham", PT_SC, ucp_Ogham },
|
||||
@@ -158,6 +160,8 @@ const ucp_type_table _pcre_utt[] = {
|
||||
{ "Pd", PT_PC, ucp_Pd },
|
||||
{ "Pe", PT_PC, ucp_Pe },
|
||||
{ "Pf", PT_PC, ucp_Pf },
|
||||
{ "Phags_Pa", PT_SC, ucp_Phags_Pa },
|
||||
{ "Phoenician", PT_SC, ucp_Phoenician },
|
||||
{ "Pi", PT_PC, ucp_Pi },
|
||||
{ "Po", PT_PC, ucp_Po },
|
||||
{ "Ps", PT_PC, ucp_Ps },
|
||||
|
||||
@@ -6,7 +6,7 @@
|
||||
and semantics are as close as possible to those of the Perl 5 language.
|
||||
|
||||
Written by Philip Hazel
|
||||
Copyright (c) 1997-2007 University of Cambridge
|
||||
Copyright (c) 1997-2006 University of Cambridge
|
||||
|
||||
-----------------------------------------------------------------------------
|
||||
Redistribution and use in source and binary forms, with or without
|
||||
|
||||
@@ -6,7 +6,7 @@
|
||||
and semantics are as close as possible to those of the Perl 5 language.
|
||||
|
||||
Written by Philip Hazel
|
||||
Copyright (c) 1997-2007 University of Cambridge
|
||||
Copyright (c) 1997-2006 University of Cambridge
|
||||
|
||||
-----------------------------------------------------------------------------
|
||||
Redistribution and use in source and binary forms, with or without
|
||||
@@ -131,11 +131,11 @@ letter, return the other case. Otherwise, return -1.
|
||||
Arguments:
|
||||
c the character value
|
||||
|
||||
Returns: the other case or -1 if none
|
||||
Returns: the other case or NOTACHAR if none
|
||||
*/
|
||||
|
||||
int
|
||||
_pcre_ucp_othercase(const int c)
|
||||
unsigned int
|
||||
_pcre_ucp_othercase(const unsigned int c)
|
||||
{
|
||||
int bot = 0;
|
||||
int top = sizeof(ucp_table)/sizeof(cnode);
|
||||
@@ -161,14 +161,14 @@ for (;;)
|
||||
}
|
||||
}
|
||||
|
||||
/* Found an entry in the table. Return -1 for a range entry. Otherwise return
|
||||
the other case if there is one, else -1. */
|
||||
/* Found an entry in the table. Return NOTACHAR for a range entry. Otherwise
|
||||
return the other case if there is one, else NOTACHAR. */
|
||||
|
||||
if ((ucp_table[mid].f0 & f0_rangeflag) != 0) return -1;
|
||||
if ((ucp_table[mid].f0 & f0_rangeflag) != 0) return NOTACHAR;
|
||||
|
||||
offset = ucp_table[mid].f1 & f1_casemask;
|
||||
if ((offset & f1_caseneg) != 0) offset |= f1_caseneg;
|
||||
return (offset == 0)? -1 : c + offset;
|
||||
return (offset == 0)? NOTACHAR : c + offset;
|
||||
}
|
||||
|
||||
|
||||
|
||||
@@ -6,7 +6,7 @@
|
||||
and semantics are as close as possible to those of the Perl 5 language.
|
||||
|
||||
Written by Philip Hazel
|
||||
Copyright (c) 1997-2007 University of Cambridge
|
||||
Copyright (c) 1997-2006 University of Cambridge
|
||||
|
||||
-----------------------------------------------------------------------------
|
||||
Redistribution and use in source and binary forms, with or without
|
||||
@@ -79,7 +79,7 @@ for (p = string; length-- > 0; p++)
|
||||
register int ab;
|
||||
register int c = *p;
|
||||
if (c < 128) continue;
|
||||
if ((c & 0xc0) != 0xc0) return p - string;
|
||||
if (c < 0xc0) return p - string;
|
||||
ab = _pcre_utf8_table4[c & 0x3f]; /* Number of additional bytes */
|
||||
if (length < ab) return p - string;
|
||||
length -= ab;
|
||||
|
||||
@@ -6,7 +6,7 @@
|
||||
and semantics are as close as possible to those of the Perl 5 language.
|
||||
|
||||
Written by Philip Hazel
|
||||
Copyright (c) 1997-2007 University of Cambridge
|
||||
Copyright (c) 1997-2006 University of Cambridge
|
||||
|
||||
-----------------------------------------------------------------------------
|
||||
Redistribution and use in source and binary forms, with or without
|
||||
@@ -49,16 +49,38 @@ string that identifies the PCRE version that is in use. */
|
||||
* Return version string *
|
||||
*************************************************/
|
||||
|
||||
/* These macros are the standard way of turning unquoted text into C strings.
|
||||
They allow macros like PCRE_MAJOR to be defined without quotes, which is
|
||||
convenient for user programs that want to test its value. */
|
||||
|
||||
#define STRING(a) # a
|
||||
#define XSTRING(s) STRING(s)
|
||||
|
||||
/* A problem turned up with PCRE_PRERELEASE, which is defined empty for
|
||||
production releases. Originally, it was used naively in this code:
|
||||
|
||||
return XSTRING(PCRE_MAJOR)
|
||||
"." XSTRING(PCRE_MINOR)
|
||||
XSTRING(PCRE_PRERELEASE)
|
||||
" " XSTRING(PCRE_DATE);
|
||||
|
||||
However, when PCRE_PRERELEASE is empty, this leads to an attempted expansion of
|
||||
STRING(). The C standard states: "If (before argument substitution) any
|
||||
argument consists of no preprocessing tokens, the behavior is undefined." It
|
||||
turns out that gcc treats this case as a single empty string - which is what we
|
||||
really want - but Visual C grumbles about the lack of an argument for the
|
||||
macro. Unfortunately, both are within their rights. To cope with both ways of
|
||||
handling this, I had to resort to some messy hackery that does a test at run time.
|
||||
I could find no way of detecting that a macro is defined as an empty string at
|
||||
pre-processor time. This hack uses a standard trick for avoiding calling
|
||||
the STRING macro with an empty argument when doing the test. */
|
||||
|
||||
PCRE_DATA_SCOPE const char *
|
||||
pcre_version(void)
|
||||
{
|
||||
return XSTRING(PCRE_MAJOR)
|
||||
"." XSTRING(PCRE_MINOR)
|
||||
XSTRING(PCRE_PRERELEASE)
|
||||
" " XSTRING(PCRE_DATE);
|
||||
return (XSTRING(Z PCRE_PRERELEASE)[1] == 0)?
|
||||
XSTRING(PCRE_MAJOR.PCRE_MINOR PCRE_DATE) :
|
||||
XSTRING(PCRE_MAJOR.PCRE_MINOR) XSTRING(PCRE_PRERELEASE PCRE_DATE);
|
||||
}
|
||||
|
||||
/* End of pcre_version.c */
|
||||
|
||||
@@ -6,7 +6,7 @@
|
||||
and semantics are as close as possible to those of the Perl 5 language.
|
||||
|
||||
Written by Philip Hazel
|
||||
Copyright (c) 1997-2007 University of Cambridge
|
||||
Copyright (c) 1997-2006 University of Cambridge
|
||||
|
||||
-----------------------------------------------------------------------------
|
||||
Redistribution and use in source and binary forms, with or without
|
||||
|
||||
@@ -61,7 +61,7 @@ static const string empty_string;
|
||||
// If the user doesn't ask for any options, we just use this one
|
||||
static RE_Options default_options;
|
||||
|
||||
void RE::Init(const char* pat, const RE_Options* options) {
|
||||
void RE::Init(const string& pat, const RE_Options* options) {
|
||||
pattern_ = pat;
|
||||
if (options == NULL) {
|
||||
options_ = default_options;
|
||||
@@ -78,7 +78,7 @@ void RE::Init(const char* pat, const RE_Options* options) {
|
||||
// conservative in that it may treat some "simple" patterns
|
||||
// as "complex" (e.g., if the vertical bar is in a character
|
||||
// class or is escaped). But it seems good enough.
|
||||
if (strchr(pat, '|') == NULL) {
|
||||
if (strchr(pat.c_str(), '|') == NULL) {
|
||||
// Simple pattern: we can use position-based checks to perform
|
||||
// fully anchored matches
|
||||
re_full_ = re_partial_;
|
||||
@@ -89,12 +89,18 @@ void RE::Init(const char* pat, const RE_Options* options) {
|
||||
}
|
||||
}
|
||||
|
||||
RE::~RE() {
|
||||
void RE::Cleanup() {
|
||||
if (re_full_ != NULL && re_full_ != re_partial_) (*pcre_free)(re_full_);
|
||||
if (re_partial_ != NULL) (*pcre_free)(re_partial_);
|
||||
if (error_ != &empty_string) delete error_;
|
||||
}
|
||||
|
||||
|
||||
RE::~RE() {
|
||||
Cleanup();
|
||||
}
|
||||
|
||||
|
||||
pcre* RE::Compile(Anchor anchor) {
|
||||
// First, convert RE_Options into pcre options
|
||||
int pcre_options = 0;
|
||||
@@ -424,6 +430,34 @@ bool RE::Extract(const StringPiece& rewrite,
|
||||
return Rewrite(out, rewrite, text, vec, matches);
|
||||
}
|
||||
|
||||
/*static*/ string RE::QuoteMeta(const StringPiece& unquoted) {
|
||||
string result;
|
||||
|
||||
// Escape any ascii character not in [A-Za-z_0-9].
|
||||
//
|
||||
// Note that it's legal to escape a character even if it has no
|
||||
// special meaning in a regular expression -- so this function does
|
||||
// that. (This also makes it identical to the perl function of the
|
||||
// same name; see `perldoc -f quotemeta`.)
|
||||
for (int ii = 0; ii < unquoted.size(); ++ii) {
|
||||
// Note that using 'isalnum' here raises the benchmark time from
|
||||
// 32ns to 58ns:
|
||||
if ((unquoted[ii] < 'a' || unquoted[ii] > 'z') &&
|
||||
(unquoted[ii] < 'A' || unquoted[ii] > 'Z') &&
|
||||
(unquoted[ii] < '0' || unquoted[ii] > '9') &&
|
||||
unquoted[ii] != '_' &&
|
||||
// If this is part of a UTF8 or Latin1 character, we need
|
||||
// to copy this byte without escaping. Experimentally this is
|
||||
// what works correctly with the regexp library.
|
||||
!(unquoted[ii] & 128)) {
|
||||
result += '\\';
|
||||
}
|
||||
result += unquoted[ii];
|
||||
}
|
||||
|
||||
return result;
|
||||
}
|
||||
|
||||
/***** Actual matching and rewriting code *****/
|
||||
|
||||
int RE::TryMatch(const StringPiece& text,
|
||||
@@ -809,14 +843,14 @@ bool Arg::parse_float(const char* str, int n, void* dest) {
|
||||
return parse_##name##_radix(str, n, dest, 0); \
|
||||
}
|
||||
|
||||
DEFINE_INTEGER_PARSERS(short);
|
||||
DEFINE_INTEGER_PARSERS(ushort);
|
||||
DEFINE_INTEGER_PARSERS(int);
|
||||
DEFINE_INTEGER_PARSERS(uint);
|
||||
DEFINE_INTEGER_PARSERS(long);
|
||||
DEFINE_INTEGER_PARSERS(ulong);
|
||||
DEFINE_INTEGER_PARSERS(longlong);
|
||||
DEFINE_INTEGER_PARSERS(ulonglong);
|
||||
DEFINE_INTEGER_PARSERS(short) /* */
|
||||
DEFINE_INTEGER_PARSERS(ushort) /* */
|
||||
DEFINE_INTEGER_PARSERS(int) /* Don't use semicolons after these */
|
||||
DEFINE_INTEGER_PARSERS(uint) /* statements because they can cause */
|
||||
DEFINE_INTEGER_PARSERS(long) /* compiler warnings if the checking */
|
||||
DEFINE_INTEGER_PARSERS(ulong) /* level is turned up high enough. */
|
||||
DEFINE_INTEGER_PARSERS(longlong) /* */
|
||||
DEFINE_INTEGER_PARSERS(ulonglong) /* */
|
||||
|
||||
#undef DEFINE_INTEGER_PARSERS
|
||||
|
||||
|
||||
@@ -112,6 +112,12 @@
|
||||
// T (where "bool T::ParseFrom(const char*, int)" exists)
|
||||
// NULL (the corresponding matched sub-pattern is not copied)
|
||||
//
|
||||
// CAVEAT: An optional sub-pattern that does not exist in the matched
|
||||
// string is assigned the empty string. Therefore, the following will
|
||||
// return false (because the empty string is not a valid number):
|
||||
// int number;
|
||||
// pcrecpp::RE::FullMatch("abc", "[a-z]+(\\d+)?", &number);
|
||||
//
|
||||
// -----------------------------------------------------------------------
|
||||
// DO_MATCH
|
||||
//
|
||||
@@ -488,8 +494,25 @@ class RE {
|
||||
// pass in a string or a "const char*" wherever an "RE" is expected.
|
||||
RE(const char* pat) { Init(pat, NULL); }
|
||||
RE(const char *pat, const RE_Options& option) { Init(pat, &option); }
|
||||
RE(const string& pat) { Init(pat.c_str(), NULL); }
|
||||
RE(const string& pat, const RE_Options& option) { Init(pat.c_str(), &option); }
|
||||
RE(const string& pat) { Init(pat, NULL); }
|
||||
RE(const string& pat, const RE_Options& option) { Init(pat, &option); }
|
||||
|
||||
// Copy constructor & assignment - note that these are expensive
|
||||
// because they recompile the expression.
|
||||
RE(const RE& re) { Init(re.pattern_, &re.options_); }
|
||||
const RE& operator=(const RE& re) {
|
||||
if (this != &re) {
|
||||
Cleanup();
|
||||
|
||||
// This is the code that originally came from Google
|
||||
// Init(re.pattern_.c_str(), &re.options_);
|
||||
|
||||
// This is the replacement from Ari Pollak
|
||||
Init(re.pattern_, &re.options_);
|
||||
}
|
||||
return *this;
|
||||
}
|
||||
|
||||
|
||||
~RE();
|
||||
|
||||
@@ -589,6 +612,15 @@ class RE {
|
||||
const StringPiece &text,
|
||||
string *out) const;
|
||||
|
||||
// Escapes all potentially meaningful regexp characters in
|
||||
// 'unquoted'. The returned string, used as a regular expression,
|
||||
// will exactly match the original string. For example,
|
||||
// 1.5-2.0?
|
||||
// may become:
|
||||
// 1\.5\-2\.0\?
|
||||
static string QuoteMeta(const StringPiece& unquoted);
|
||||
|
||||
|
||||
/***** Generic matching interface *****/
|
||||
|
||||
// Type of match (TODO: Should be restructured as part of RE_Options)
|
||||
@@ -611,7 +643,8 @@ class RE {
|
||||
|
||||
private:
|
||||
|
||||
void Init(const char* pattern, const RE_Options* options);
|
||||
void Init(const string& pattern, const RE_Options* options);
|
||||
void Cleanup();
|
||||
|
||||
// Match against "text", filling in "vec" (up to "vecsize" * 2/3) with
|
||||
// pairs of integers for the beginning and end positions of matched
|
||||
@@ -655,11 +688,6 @@ class RE {
|
||||
pcre* re_full_; // For full matches
|
||||
pcre* re_partial_; // For partial matches
|
||||
const string* error_; // Error indicator (or points to empty string)
|
||||
|
||||
// Don't allow the default copy or assignment constructors --
|
||||
// they're expensive and too easy to do by accident.
|
||||
RE(const RE&);
|
||||
void operator=(const RE&);
|
||||
};
|
||||
|
||||
} // namespace pcrecpp
|
||||
|
||||
@@ -1,4 +1,6 @@
|
||||
// Copyright (c) 2005, Google Inc.
|
||||
// -*- coding: utf-8 -*-
|
||||
//
|
||||
// Copyright (c) 2005 - 2006, Google Inc.
|
||||
// All rights reserved.
|
||||
//
|
||||
// Redistribution and use in source and binary forms, with or without
|
||||
@@ -445,6 +447,80 @@ static void TestRecursion() {
|
||||
CHECK(re4.FullMatch(text_bad) == false);
|
||||
}
|
||||
|
||||
// A meta-quoted string, interpreted as a pattern, should always match
|
||||
// the original unquoted string.
|
||||
static void TestQuoteMeta(string unquoted, RE_Options options = RE_Options()) {
|
||||
string quoted = RE::QuoteMeta(unquoted);
|
||||
RE re(quoted, options);
|
||||
CHECK(re.FullMatch(unquoted));
|
||||
}
|
||||
|
||||
// A string containing meaningful regexp characters, which is then meta-
|
||||
// quoted, should not generally match a string the unquoted string does.
|
||||
static void NegativeTestQuoteMeta(string unquoted, string should_not_match,
|
||||
RE_Options options = RE_Options()) {
|
||||
string quoted = RE::QuoteMeta(unquoted);
|
||||
RE re(quoted, options);
|
||||
CHECK(!re.FullMatch(should_not_match));
|
||||
}
|
||||
|
||||
// Tests that quoted meta characters match their original strings,
|
||||
// and that a few things that shouldn't match indeed do not.
|
||||
static void TestQuotaMetaSimple() {
  // Every entry is meta-quoted, compiled, and matched against itself. The
  // list mixes plain text with text made largely of regexp metacharacters.
  static const char* const kLiterals[] = {
    "foo",
    "foo.bar",
    "foo\\.bar",
    "[1-9]",
    "1.5-2.0?",
    "\\d",
    "Who doesn't like ice cream?",
    "((a|b)c?d*e+[f-h]i)",
    "((?!)xxx).*yyy",
    "(["
  };
  const int literal_count = sizeof(kLiterals) / sizeof(kLiterals[0]);

  // Run in declaration order, exactly one TestQuoteMeta call per literal.
  for (int i = 0; i < literal_count; ++i) {
    TestQuoteMeta(kLiterals[i]);
  }
}
|
||||
|
||||
static void TestQuoteMetaSimpleNegative() {
  // Pairs of (text to be quoted, subject that the quoted pattern must NOT
  // fully match).
  struct NegativeCase {
    const char* unquoted;
    const char* should_not_match;
  };
  static const NegativeCase kCases[] = {
    { "foo",       "bar" },
    { "...",       "bar" },
    { "\\.",       "."   },
    { "\\.",       ".."  },
    { "(a)",       "a"   },
    { "(a|b)",     "a"   },
    { "(a|b)",     "(a)" },
    { "(a|b)",     "a|b" },
    { "[0-9]",     "0"   },
    { "[0-9]",     "0-9" },
    { "[0-9]",     "[9]" },
    { "((?!)xxx)", "xxx" }
  };
  const int case_count = sizeof(kCases) / sizeof(kCases[0]);

  // Run in declaration order, one NegativeTestQuoteMeta call per pair.
  for (int i = 0; i < case_count; ++i) {
    NegativeTestQuoteMeta(kCases[i].unquoted, kCases[i].should_not_match);
  }
}
|
||||
|
||||
static void TestQuoteMetaLatin1() {
  // The subject contains one byte with the high bit set (0xb2). No options
  // are passed, so TestQuoteMeta runs with its default RE_Options.
  const char* const high_bit_text = "3\xb2 = 9";
  TestQuoteMeta(high_bit_text);
}
|
||||
|
||||
static void TestQuoteMetaUtf8() {
#ifdef SUPPORT_UTF8
  // One options object shared by every UTF-8 case below.
  const RE_Options utf8_options = pcrecpp::UTF8();

  TestQuoteMeta("Pl\xc3\xa1\x63ido Domingo", utf8_options);
  TestQuoteMeta("xyz", utf8_options);                // No fancy utf8
  TestQuoteMeta("\xc2\xb0", utf8_options);           // 2-byte utf8 (degree symbol)
  TestQuoteMeta("27\xc2\xb0 degrees", utf8_options); // As a middle character
  TestQuoteMeta("\xe2\x80\xb3", utf8_options);       // 3-byte utf8 (double prime)
  TestQuoteMeta("\xf0\x9d\x85\x9f", utf8_options);   // 4-byte utf8 (music note)

  // Same bytes without the UTF8 option: interpreted as Latin-1, but should
  // still work.
  TestQuoteMeta("27\xc2\xb0");

  // With the UTF8 option, the quoted 2-byte degree symbol must not match a
  // subject in which each of its bytes is preceded by a backslash.
  NegativeTestQuoteMeta("27\xc2\xb0",
                        "27\\\xc2\\\xb0",
                        utf8_options);
#endif
}
|
||||
|
||||
static void TestQuoteMetaAll() {
  // Driver for the QuoteMeta test group: announce it, then run each sub-test
  // in a fixed order.
  typedef void (*QuoteMetaTest)();
  static const QuoteMetaTest kTests[] = {
    TestQuotaMetaSimple,
    TestQuoteMetaSimpleNegative,
    TestQuoteMetaLatin1,
    TestQuoteMetaUtf8
  };
  const int test_count = sizeof(kTests) / sizeof(kTests[0]);

  printf("Testing QuoteMeta\n");
  for (int i = 0; i < test_count; ++i) {
    kTests[i]();
  }
}
|
||||
|
||||
//
|
||||
// Options tests contributed by
|
||||
// Giuseppe Maxia, CTO, Stardata s.r.l.
|
||||
@@ -667,6 +743,35 @@ static void TestOptions() {
|
||||
Test_all_options();
|
||||
}
|
||||
|
||||
static void TestConstructors() {
|
||||
printf("Testing constructors\n");
|
||||
|
||||
RE_Options options;
|
||||
options.set_dotall(true);
|
||||
const char *str = "HELLO\n" "cruel\n" "world";
|
||||
|
||||
RE orig("HELLO.*world", options);
|
||||
CHECK(orig.FullMatch(str));
|
||||
|
||||
RE copy1(orig);
|
||||
CHECK(copy1.FullMatch(str));
|
||||
|
||||
RE copy2("not a match");
|
||||
CHECK(!copy2.FullMatch(str));
|
||||
copy2 = copy1;
|
||||
CHECK(copy2.FullMatch(str));
|
||||
copy2 = orig;
|
||||
CHECK(copy2.FullMatch(str));
|
||||
|
||||
// Make sure when we assign to ourselves, nothing bad happens
|
||||
orig = orig;
|
||||
copy1 = copy1;
|
||||
copy2 = copy2;
|
||||
CHECK(orig.FullMatch(str));
|
||||
CHECK(copy1.FullMatch(str));
|
||||
CHECK(copy2.FullMatch(str));
|
||||
}
|
||||
|
||||
int main(int argc, char** argv) {
|
||||
// Treat any flag as --help
|
||||
if (argc > 1 && argv[1][0] == '-') {
|
||||
@@ -985,11 +1090,14 @@ int main(int argc, char** argv) {
|
||||
CHECK(RE("h.*o").PartialMatch("hello!"));
|
||||
CHECK(RE("((((((((((((((((((((x))))))))))))))))))))").PartialMatch("x"));
|
||||
|
||||
/***** other tests *****/
|
||||
|
||||
RadixTests();
|
||||
TestReplace();
|
||||
TestExtract();
|
||||
TestConsume();
|
||||
TestFindAndConsume();
|
||||
TestQuoteMetaAll();
|
||||
TestMatchNumberPeculiarity();
|
||||
|
||||
// Check the pattern() accessor
|
||||
@@ -1109,6 +1217,9 @@ int main(int argc, char** argv) {
|
||||
VERBOSE_TEST = true;
|
||||
TestOptions();
|
||||
|
||||
// Test the constructors
|
||||
TestConstructors();
|
||||
|
||||
// Done
|
||||
printf("OK\n");
|
||||
|
||||
|
||||
@@ -6,7 +6,7 @@
|
||||
its pattern matching. On a Unix or Win32 system it can recurse into
|
||||
directories.
|
||||
|
||||
Copyright (c) 1997-2007 University of Cambridge
|
||||
Copyright (c) 1997-2006 University of Cambridge
|
||||
|
||||
-----------------------------------------------------------------------------
|
||||
Redistribution and use in source and binary forms, with or without
|
||||
@@ -56,7 +56,7 @@ POSSIBILITY OF SUCH DAMAGE.
|
||||
|
||||
typedef int BOOL;
|
||||
|
||||
#define VERSION "4.3 01-Jun-2006"
|
||||
#define VERSION "4.4 29-Nov-2006"
|
||||
#define MAX_PATTERN_COUNT 100
|
||||
|
||||
#if BUFSIZ > 8192
|
||||
@@ -65,7 +65,6 @@ typedef int BOOL;
|
||||
#define MBUFTHIRD 8192
|
||||
#endif
|
||||
|
||||
|
||||
/* Values for the "filenames" variable, which specifies options for file name
|
||||
output. The order is important; it is assumed that a file name is wanted for
|
||||
all values greater than FN_DEFAULT. */
|
||||
@@ -83,6 +82,10 @@ enum { DEE_READ, DEE_SKIP };
|
||||
#define PO_LINE_MATCH 0x0002
|
||||
#define PO_FIXED_STRINGS 0x0004
|
||||
|
||||
/* Line ending types */
|
||||
|
||||
enum { EL_LF, EL_CR, EL_CRLF, EL_ANY };
|
||||
|
||||
|
||||
|
||||
/*************************************************
|
||||
@@ -100,8 +103,7 @@ static const char *jfriedl_prefix = "";
|
||||
static const char *jfriedl_postfix = "";
|
||||
#endif
|
||||
|
||||
static int endlinebyte = '\n'; /* Last byte of endline sequence */
|
||||
static int endlineextra = 0; /* Extra bytes for endline sequence */
|
||||
static int endlinetype;
|
||||
|
||||
static char *colour_string = (char *)"1;31";
|
||||
static char *colour_option = NULL;
|
||||
@@ -142,6 +144,7 @@ static BOOL number = FALSE;
|
||||
static BOOL only_matching = FALSE;
|
||||
static BOOL quiet = FALSE;
|
||||
static BOOL silent = FALSE;
|
||||
static BOOL utf8 = FALSE;
|
||||
|
||||
/* Structure for options and list of them */
|
||||
|
||||
@@ -219,6 +222,16 @@ static const char *prefix[] = {
|
||||
static const char *suffix[] = {
|
||||
"", "\\b", ")$", ")$", "\\E", "\\E\\b", "\\E)$", "\\E)$" };
|
||||
|
||||
/* UTF-8 tables - used only when the newline setting is "all". */
|
||||
|
||||
const int utf8_table3[] = { 0xff, 0x1f, 0x0f, 0x07, 0x03, 0x01};
|
||||
|
||||
const char utf8_table4[] = {
|
||||
1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
|
||||
1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
|
||||
2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,
|
||||
3,3,3,3,3,3,3,3,4,4,4,4,5,5,5,5 };
|
||||
|
||||
|
||||
|
||||
/*************************************************
|
||||
@@ -470,6 +483,216 @@ return sys_errlist[n];
|
||||
|
||||
|
||||
|
||||
/*************************************************
|
||||
* Find end of line *
|
||||
*************************************************/
|
||||
|
||||
/* The length of the endline sequence that is found is set via lenptr. This may
|
||||
be zero at the very end of the file if there is no line-ending sequence there.
|
||||
|
||||
Arguments:
|
||||
p current position in line
|
||||
endptr end of available data
|
||||
lenptr where to put the length of the eol sequence
|
||||
|
||||
Returns:    pointer just past the end-of-line sequence (that is, the start of the next line), or endptr if no line ending was found
|
||||
*/
|
||||
|
||||
static char *
|
||||
end_of_line(char *p, char *endptr, int *lenptr)
|
||||
{
|
||||
switch(endlinetype)
|
||||
{
|
||||
default: /* Just in case */
|
||||
case EL_LF:
|
||||
while (p < endptr && *p != '\n') p++;
|
||||
if (p < endptr)
|
||||
{
|
||||
*lenptr = 1;
|
||||
return p + 1;
|
||||
}
|
||||
*lenptr = 0;
|
||||
return endptr;
|
||||
|
||||
case EL_CR:
|
||||
while (p < endptr && *p != '\r') p++;
|
||||
if (p < endptr)
|
||||
{
|
||||
*lenptr = 1;
|
||||
return p + 1;
|
||||
}
|
||||
*lenptr = 0;
|
||||
return endptr;
|
||||
|
||||
case EL_CRLF:
|
||||
for (;;)
|
||||
{
|
||||
while (p < endptr && *p != '\r') p++;
|
||||
if (++p >= endptr)
|
||||
{
|
||||
*lenptr = 0;
|
||||
return endptr;
|
||||
}
|
||||
if (*p == '\n')
|
||||
{
|
||||
*lenptr = 2;
|
||||
return p + 1;
|
||||
}
|
||||
}
|
||||
break;
|
||||
|
||||
case EL_ANY:
|
||||
while (p < endptr)
|
||||
{
|
||||
int extra = 0;
|
||||
register int c = *((unsigned char *)p);
|
||||
|
||||
if (utf8 && c >= 0xc0)
|
||||
{
|
||||
int gcii, gcss;
|
||||
extra = utf8_table4[c & 0x3f]; /* Number of additional bytes */
|
||||
gcss = 6*extra;
|
||||
c = (c & utf8_table3[extra]) << gcss;
|
||||
for (gcii = 1; gcii <= extra; gcii++)
|
||||
{
|
||||
gcss -= 6;
|
||||
c |= (p[gcii] & 0x3f) << gcss;
|
||||
}
|
||||
}
|
||||
|
||||
p += 1 + extra;
|
||||
|
||||
switch (c)
|
||||
{
|
||||
case 0x0a: /* LF */
|
||||
case 0x0b: /* VT */
|
||||
case 0x0c: /* FF */
|
||||
*lenptr = 1;
|
||||
return p;
|
||||
|
||||
case 0x0d: /* CR */
|
||||
if (p < endptr && *p == 0x0a)
|
||||
{
|
||||
*lenptr = 2;
|
||||
p++;
|
||||
}
|
||||
else *lenptr = 1;
|
||||
return p;
|
||||
|
||||
case 0x85: /* NEL */
|
||||
*lenptr = utf8? 2 : 1;
|
||||
return p;
|
||||
|
||||
case 0x2028: /* LS */
|
||||
case 0x2029: /* PS */
|
||||
*lenptr = 3;
|
||||
return p;
|
||||
|
||||
default:
|
||||
break;
|
||||
}
|
||||
} /* End of loop for ANY case */
|
||||
|
||||
*lenptr = 0; /* Must have hit the end */
|
||||
return endptr;
|
||||
} /* End of overall switch */
|
||||
}
|
||||
|
||||
|
||||
|
||||
/*************************************************
|
||||
* Find start of previous line *
|
||||
*************************************************/
|
||||
|
||||
/* This is called when looking back for before lines to print.
|
||||
|
||||
Arguments:
|
||||
p start of the subsequent line
|
||||
startptr start of available data
|
||||
|
||||
Returns: pointer to the start of the previous line
|
||||
*/
|
||||
|
||||
static char *
|
||||
previous_line(char *p, char *startptr)
|
||||
{
|
||||
switch(endlinetype)
|
||||
{
|
||||
default: /* Just in case */
|
||||
case EL_LF:
|
||||
p--;
|
||||
while (p > startptr && p[-1] != '\n') p--;
|
||||
return p;
|
||||
|
||||
case EL_CR:
|
||||
p--;
|
||||
while (p > startptr && p[-1] != '\n') p--;
|
||||
return p;
|
||||
|
||||
case EL_CRLF:
|
||||
for (;;)
|
||||
{
|
||||
p -= 2;
|
||||
while (p > startptr && p[-1] != '\n') p--;
|
||||
if (p <= startptr + 1 || p[-2] == '\r') return p;
|
||||
}
|
||||
return p; /* But control should never get here */
|
||||
|
||||
case EL_ANY:
|
||||
if (*(--p) == '\n' && p > startptr && p[-1] == '\r') p--;
|
||||
if (utf8) while ((*p & 0xc0) == 0x80) p--;
|
||||
|
||||
while (p > startptr)
|
||||
{
|
||||
register int c;
|
||||
char *pp = p - 1;
|
||||
|
||||
if (utf8)
|
||||
{
|
||||
int extra = 0;
|
||||
while ((*pp & 0xc0) == 0x80) pp--;
|
||||
c = *((unsigned char *)pp);
|
||||
if (c >= 0xc0)
|
||||
{
|
||||
int gcii, gcss;
|
||||
extra = utf8_table4[c & 0x3f]; /* Number of additional bytes */
|
||||
gcss = 6*extra;
|
||||
c = (c & utf8_table3[extra]) << gcss;
|
||||
for (gcii = 1; gcii <= extra; gcii++)
|
||||
{
|
||||
gcss -= 6;
|
||||
c |= (pp[gcii] & 0x3f) << gcss;
|
||||
}
|
||||
}
|
||||
}
|
||||
else c = *((unsigned char *)pp);
|
||||
|
||||
switch (c)
|
||||
{
|
||||
case 0x0a: /* LF */
|
||||
case 0x0b: /* VT */
|
||||
case 0x0c: /* FF */
|
||||
case 0x0d: /* CR */
|
||||
case 0x85: /* NEL */
|
||||
case 0x2028: /* LS */
|
||||
case 0x2029: /* PS */
|
||||
return p;
|
||||
|
||||
default:
|
||||
break;
|
||||
}
|
||||
|
||||
p = pp; /* Back one character */
|
||||
} /* End of loop for ANY case */
|
||||
|
||||
return startptr; /* Hit start of data */
|
||||
} /* End of overall switch */
|
||||
}
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
/*************************************************
|
||||
* Print the previous "after" lines *
|
||||
*************************************************/
|
||||
@@ -495,13 +718,13 @@ if (after_context > 0 && lastmatchnumber > 0)
|
||||
int count = 0;
|
||||
while (lastmatchrestart < endptr && count++ < after_context)
|
||||
{
|
||||
int ellength;
|
||||
char *pp = lastmatchrestart;
|
||||
if (printname != NULL) fprintf(stdout, "%s-", printname);
|
||||
if (number) fprintf(stdout, "%d-", lastmatchnumber++);
|
||||
while (*pp != endlinebyte) pp++;
|
||||
fwrite(lastmatchrestart, 1, pp - lastmatchrestart + (1 + endlineextra),
|
||||
stdout);
|
||||
lastmatchrestart = pp + 1;
|
||||
pp = end_of_line(pp, endptr, &ellength);
|
||||
fwrite(lastmatchrestart, 1, pp - lastmatchrestart, stdout);
|
||||
lastmatchrestart = pp;
|
||||
}
|
||||
hyphenpending = TRUE;
|
||||
}
|
||||
@@ -558,7 +781,7 @@ way, the buffer is shifted left and re-filled. */
|
||||
|
||||
while (ptr < endptr)
|
||||
{
|
||||
int i;
|
||||
int i, endlinelength;
|
||||
int mrc = 0;
|
||||
BOOL match = FALSE;
|
||||
char *t = ptr;
|
||||
@@ -571,11 +794,10 @@ while (ptr < endptr)
|
||||
line. In multiline mode the PCRE_FIRSTLINE option is used for compiling, so
|
||||
that any match is constrained to be in the first line. */
|
||||
|
||||
linelength = 0;
|
||||
while (t < endptr && *t++ != endlinebyte) linelength++;
|
||||
t = end_of_line(t, endptr, &endlinelength);
|
||||
linelength = t - ptr - endlinelength;
|
||||
length = multiline? endptr - ptr : linelength;
|
||||
|
||||
|
||||
/* Extra processing for Jeffrey Friedl's debugging. */
|
||||
|
||||
#ifdef JFRIEDL_DEBUG
|
||||
@@ -706,13 +928,13 @@ while (ptr < endptr)
|
||||
|
||||
if (after_context > 0 && lastmatchnumber > 0)
|
||||
{
|
||||
int ellength;
|
||||
int linecount = 0;
|
||||
char *p = lastmatchrestart;
|
||||
|
||||
while (p < ptr && linecount < after_context)
|
||||
{
|
||||
while (*p != endlinebyte) p++;
|
||||
p++;
|
||||
p = end_of_line(p, ptr, &ellength);
|
||||
linecount++;
|
||||
}
|
||||
|
||||
@@ -725,10 +947,9 @@ while (ptr < endptr)
|
||||
char *pp = lastmatchrestart;
|
||||
if (printname != NULL) fprintf(stdout, "%s-", printname);
|
||||
if (number) fprintf(stdout, "%d-", lastmatchnumber++);
|
||||
while (*pp != endlinebyte) pp++;
|
||||
fwrite(lastmatchrestart, 1, pp - lastmatchrestart +
|
||||
(1 + endlineextra), stdout);
|
||||
lastmatchrestart = pp + 1;
|
||||
pp = end_of_line(pp, endptr, &ellength);
|
||||
fwrite(lastmatchrestart, 1, pp - lastmatchrestart, stdout);
|
||||
lastmatchrestart = pp;
|
||||
}
|
||||
if (lastmatchrestart != ptr) hyphenpending = TRUE;
|
||||
}
|
||||
@@ -754,8 +975,7 @@ while (ptr < endptr)
|
||||
linecount < before_context)
|
||||
{
|
||||
linecount++;
|
||||
p--;
|
||||
while (p > buffer && p[-1] != endlinebyte) p--;
|
||||
p = previous_line(p, buffer);
|
||||
}
|
||||
|
||||
if (lastmatchnumber > 0 && p > lastmatchrestart && !hyphenprinted)
|
||||
@@ -763,12 +983,13 @@ while (ptr < endptr)
|
||||
|
||||
while (p < ptr)
|
||||
{
|
||||
int ellength;
|
||||
char *pp = p;
|
||||
if (printname != NULL) fprintf(stdout, "%s-", printname);
|
||||
if (number) fprintf(stdout, "%d-", linenumber - linecount--);
|
||||
while (*pp != endlinebyte) pp++;
|
||||
fwrite(p, 1, pp - p + (1 + endlineextra), stdout);
|
||||
p = pp + 1;
|
||||
pp = end_of_line(pp, endptr, &ellength);
|
||||
fwrite(p, 1, pp - p, stdout);
|
||||
p = pp;
|
||||
}
|
||||
}
|
||||
|
||||
@@ -788,11 +1009,16 @@ while (ptr < endptr)
|
||||
|
||||
if (multiline)
|
||||
{
|
||||
int ellength;
|
||||
char *endmatch = ptr + offsets[1];
|
||||
t = ptr;
|
||||
while (t < endmatch) { if (*t++ == endlinebyte) linenumber++; }
|
||||
while (endmatch < endptr && *endmatch != endlinebyte) endmatch++;
|
||||
linelength = endmatch - ptr;
|
||||
while (t < endmatch)
|
||||
{
|
||||
t = end_of_line(t, endptr, &ellength);
|
||||
if (t <= endmatch) linenumber++; else break;
|
||||
}
|
||||
endmatch = end_of_line(endmatch, endptr, &ellength);
|
||||
linelength = endmatch - ptr - ellength;
|
||||
}
|
||||
|
||||
/*** NOTE: Use only fwrite() to output the data line, so that binary
|
||||
@@ -824,9 +1050,7 @@ while (ptr < endptr)
|
||||
fprintf(stdout, "%c[00m", 0x1b);
|
||||
fwrite(ptr + offsets[1], 1, linelength - offsets[1], stdout);
|
||||
}
|
||||
else fwrite(ptr, 1, linelength, stdout);
|
||||
|
||||
fprintf(stdout, "\n");
|
||||
else fwrite(ptr, 1, linelength + endlinelength, stdout);
|
||||
}
|
||||
|
||||
/* End of doing what has to be done for a match */
|
||||
@@ -836,13 +1060,13 @@ while (ptr < endptr)
|
||||
/* Remember where the last match happened for after_context. We remember
|
||||
where we are about to restart, and that line's number. */
|
||||
|
||||
lastmatchrestart = ptr + linelength + 1;
|
||||
lastmatchrestart = ptr + linelength + endlinelength;
|
||||
lastmatchnumber = linenumber + 1;
|
||||
}
|
||||
|
||||
/* Advance to after the newline and increment the line number. */
|
||||
|
||||
ptr += linelength + 1;
|
||||
ptr += linelength + endlinelength;
|
||||
linenumber++;
|
||||
|
||||
/* If we haven't yet reached the end of the file (the buffer is full), and
|
||||
@@ -1098,7 +1322,7 @@ switch(letter)
|
||||
case 'q': quiet = TRUE; break;
|
||||
case 'r': dee_action = dee_RECURSE; break;
|
||||
case 's': silent = TRUE; break;
|
||||
case 'u': options |= PCRE_UTF8; break;
|
||||
case 'u': options |= PCRE_UTF8; utf8 = TRUE; break;
|
||||
case 'v': invert = TRUE; break;
|
||||
case 'w': process_options |= PO_WORD_MATCH; break;
|
||||
case 'x': process_options |= PO_LINE_MATCH; break;
|
||||
@@ -1231,14 +1455,16 @@ compile_pattern(char *pattern, int options, char *filename, int count)
|
||||
{
|
||||
if ((process_options & PO_FIXED_STRINGS) != 0)
|
||||
{
|
||||
char *eop = pattern + strlen(pattern);
|
||||
char buffer[MBUFTHIRD];
|
||||
for(;;)
|
||||
{
|
||||
char *p = strchr(pattern, endlinebyte);
|
||||
if (p == NULL)
|
||||
int ellength;
|
||||
char *p = end_of_line(pattern, eop, &ellength);
|
||||
if (ellength == 0)
|
||||
return compile_single_pattern(pattern, options, filename, count);
|
||||
sprintf(buffer, "%.*s", p - pattern - endlineextra, pattern);
|
||||
pattern = p + 1;
|
||||
sprintf(buffer, "%.*s", p - pattern - ellength, pattern);
|
||||
pattern = p;
|
||||
if (!compile_single_pattern(buffer, options, filename, count))
|
||||
return FALSE;
|
||||
}
|
||||
@@ -1267,7 +1493,9 @@ char *patterns[MAX_PATTERN_COUNT];
|
||||
const char *locale_from = "--locale";
|
||||
const char *error;
|
||||
|
||||
/* Set the default line ending value from the default in the PCRE library. */
|
||||
/* Set the default line ending value from the default in the PCRE library;
|
||||
"lf", "cr", "crlf", and "any" are supported. Anything else is treated as "lf".
|
||||
*/
|
||||
|
||||
(void)pcre_config(PCRE_CONFIG_NEWLINE, &i);
|
||||
switch(i)
|
||||
@@ -1275,6 +1503,7 @@ switch(i)
|
||||
default: newline = (char *)"lf"; break;
|
||||
case '\r': newline = (char *)"cr"; break;
|
||||
case ('\r' << 8) | '\n': newline = (char *)"crlf"; break;
|
||||
case -1: newline = (char *)"any"; break;
|
||||
}
|
||||
|
||||
/* Process the options */
|
||||
@@ -1565,16 +1794,22 @@ if (colour_option != NULL && strcmp(colour_option, "never") != 0)
|
||||
if (strcmp(newline, "cr") == 0 || strcmp(newline, "CR") == 0)
|
||||
{
|
||||
pcre_options |= PCRE_NEWLINE_CR;
|
||||
endlinebyte = '\r';
|
||||
endlinetype = EL_CR;
|
||||
}
|
||||
else if (strcmp(newline, "lf") == 0 || strcmp(newline, "LF") == 0)
|
||||
{
|
||||
pcre_options |= PCRE_NEWLINE_LF;
|
||||
endlinetype = EL_LF;
|
||||
}
|
||||
else if (strcmp(newline, "crlf") == 0 || strcmp(newline, "CRLF") == 0)
|
||||
{
|
||||
pcre_options |= PCRE_NEWLINE_CRLF;
|
||||
endlineextra = 1;
|
||||
endlinetype = EL_CRLF;
|
||||
}
|
||||
else if (strcmp(newline, "any") == 0 || strcmp(newline, "ANY") == 0)
|
||||
{
|
||||
pcre_options |= PCRE_NEWLINE_ANY;
|
||||
endlinetype = EL_ANY;
|
||||
}
|
||||
else
|
||||
{
|
||||
|
||||
@@ -6,7 +6,7 @@
|
||||
and semantics are as close as possible to those of the Perl 5 language.
|
||||
|
||||
Written by Philip Hazel
|
||||
Copyright (c) 1997-2007 University of Cambridge
|
||||
Copyright (c) 1997-2006 University of Cambridge
|
||||
|
||||
-----------------------------------------------------------------------------
|
||||
Redistribution and use in source and binary forms, with or without
|
||||
@@ -78,7 +78,7 @@ static const int eint[] = {
|
||||
REG_BADPAT, /* unrecognized character after (?< */
|
||||
REG_BADPAT, /* lookbehind assertion is not fixed length */
|
||||
REG_BADPAT, /* malformed number or name after (?( */
|
||||
REG_BADPAT, /* conditional group containe more than two branches */
|
||||
REG_BADPAT, /* conditional group contains more than two branches */
|
||||
REG_BADPAT, /* assertion expected after (?( */
|
||||
REG_BADPAT, /* (?R or (?digits must be followed by ) */
|
||||
REG_ECTYPE, /* unknown POSIX class name */
|
||||
@@ -93,7 +93,7 @@ static const int eint[] = {
|
||||
REG_BADPAT, /* closing ) for (?C expected */
|
||||
REG_BADPAT, /* recursive call could loop indefinitely */
|
||||
REG_BADPAT, /* unrecognized character after (?P */
|
||||
REG_BADPAT, /* syntax error after (?P */
|
||||
REG_BADPAT, /* syntax error in subpattern name (missing terminator) */
|
||||
REG_BADPAT, /* two named subpatterns have the same name */
|
||||
REG_BADPAT, /* invalid UTF-8 string */
|
||||
REG_BADPAT, /* support for \P, \p, and \X has not been compiled */
|
||||
@@ -102,7 +102,13 @@ static const int eint[] = {
|
||||
REG_BADPAT, /* subpattern name is too long (maximum 32 characters) */
|
||||
REG_BADPAT, /* too many named subpatterns (maximum 10,000) */
|
||||
REG_BADPAT, /* repeated subpattern is too long */
|
||||
REG_BADPAT /* octal value is greater than \377 (not in UTF-8 mode) */
|
||||
REG_BADPAT, /* octal value is greater than \377 (not in UTF-8 mode) */
|
||||
REG_BADPAT, /* internal error: overran compiling workspace */
|
||||
REG_BADPAT, /* internal error: previously-checked referenced subpattern not found */
|
||||
REG_BADPAT, /* DEFINE group contains more than one branch */
|
||||
REG_BADPAT, /* repeating a DEFINE group is not allowed */
|
||||
REG_INVARG, /* inconsistent NEWLINE options */
|
||||
REG_BADPAT /* \g is not followed by an (optionally braced) non-zero number */
|
||||
};
|
||||
|
||||
/* Table of texts corresponding to POSIX error codes */
|
||||
|
||||
@@ -9,7 +9,7 @@
|
||||
Compatible Regular Expression library. It defines the things POSIX says should
|
||||
be there. I hope.
|
||||
|
||||
Copyright (c) 1997-2007 University of Cambridge
|
||||
Copyright (c) 1997-2006 University of Cambridge
|
||||
|
||||
-----------------------------------------------------------------------------
|
||||
Redistribution and use in source and binary forms, with or without
|
||||
|
||||
@@ -44,10 +44,29 @@ POSSIBILITY OF SUCH DAMAGE.
|
||||
#include <locale.h>
|
||||
#include <errno.h>
|
||||
|
||||
#ifndef _WIN32
|
||||
#include <sys/resource.h>
|
||||
|
||||
/* A number of things vary for Windows builds. Originally, pcretest opened its
|
||||
input and output without "b"; then I was told that "b" was needed in some
|
||||
environments, so it was added for release 5.0 to both the input and output. (It
|
||||
makes no difference on Unix-like systems.) Later I was told that it is wrong
|
||||
for the input on Windows. I've now abstracted the modes into two macros that
|
||||
are set here, to make it easier to fiddle with them, and removed "b" from the
|
||||
input mode under Windows. */
|
||||
|
||||
#if defined(_WIN32) || defined(WIN32)
|
||||
#include <io.h> /* For _setmode() */
|
||||
#include <fcntl.h> /* For _O_BINARY */
|
||||
#define INPUT_MODE "r"
|
||||
#define OUTPUT_MODE "wb"
|
||||
|
||||
#else
|
||||
#include <sys/time.h> /* These two includes are needed */
|
||||
#include <sys/resource.h> /* for setrlimit(). */
|
||||
#define INPUT_MODE "rb"
|
||||
#define OUTPUT_MODE "wb"
|
||||
#endif
|
||||
|
||||
|
||||
#define PCRE_SPY /* For Win32 build, import data, not export */
|
||||
|
||||
/* We include pcre_internal.h because we need the internal info for displaying
|
||||
@@ -74,10 +93,18 @@ symbols to prevent clashes. */
|
||||
|
||||
/* We also need the pcre_printint() function for printing out compiled
|
||||
patterns. This function is in a separate file so that it can be included in
|
||||
pcre_compile.c when that module is compiled with debugging enabled. */
|
||||
pcre_compile.c when that module is compiled with debugging enabled.
|
||||
|
||||
The definition of the macro PRINTABLE, which determines whether to print an
|
||||
output character as-is or as a hex value when showing compiled patterns, is
|
||||
contained in this file. We use it here also, in cases when the locale has not
|
||||
been explicitly changed, so as to get consistent output from systems that
|
||||
differ in their output from isprint() even in the "C" locale. */
|
||||
|
||||
#include "pcre_printint.src"
|
||||
|
||||
#define PRINTHEX(c) (locale_set? isprint(c) : PRINTABLE(c))
|
||||
|
||||
|
||||
/* It is possible to compile this test program without including support for
|
||||
testing the POSIX interface, though this is not available via the standard
|
||||
@@ -103,6 +130,8 @@ function (define NOINFOCHECK). */
|
||||
#endif
|
||||
#endif
|
||||
|
||||
/* This is the default loop count for timing. */
|
||||
|
||||
#define LOOPREPEAT 500000
|
||||
|
||||
/* Static variables */
|
||||
@@ -114,6 +143,7 @@ static int callout_extra;
|
||||
static int callout_fail_count;
|
||||
static int callout_fail_id;
|
||||
static int first_callout;
|
||||
static int locale_set = 0;
|
||||
static int show_malloc;
|
||||
static int use_utf8;
|
||||
static size_t gotten_store;
|
||||
@@ -157,6 +187,7 @@ uschar *here = start;
|
||||
for (;;)
|
||||
{
|
||||
int rlen = buffer_size - (here - buffer);
|
||||
|
||||
if (rlen > 1000)
|
||||
{
|
||||
int dlen;
|
||||
@@ -213,7 +244,7 @@ return NULL; /* Control never gets here */
|
||||
|
||||
/* We don't use strtoul() because SunOS4 doesn't have it. Rather than mess
|
||||
around with conditional compilation, just do the job by hand. It is only used
|
||||
for unpicking the -o argument, so just keep it simple.
|
||||
for unpicking arguments, so just keep it simple.
|
||||
|
||||
Arguments:
|
||||
str string to be converted
|
||||
@@ -311,6 +342,8 @@ Arguments:
|
||||
Returns: number of characters placed in the buffer
|
||||
*/
|
||||
|
||||
#if !defined NOUTF8
|
||||
|
||||
static int
|
||||
ord2utf8(int cvalue, uschar *utf8bytes)
|
||||
{
|
||||
@@ -327,6 +360,8 @@ for (j = i; j > 0; j--)
|
||||
return i + 1;
|
||||
}
|
||||
|
||||
#endif
|
||||
|
||||
|
||||
|
||||
/*************************************************
|
||||
@@ -353,16 +388,19 @@ while (length-- > 0)
|
||||
{
|
||||
length -= rc - 1;
|
||||
p += rc;
|
||||
if (c < 256 && isprint(c))
|
||||
if (PRINTHEX(c))
|
||||
{
|
||||
if (f != NULL) fprintf(f, "%c", c);
|
||||
yield++;
|
||||
}
|
||||
else
|
||||
{
|
||||
int n;
|
||||
if (f != NULL) fprintf(f, "\\x{%02x}%n", c, &n);
|
||||
yield += n;
|
||||
int n = 4;
|
||||
if (f != NULL) fprintf(f, "\\x{%02x}", c);
|
||||
yield += (n <= 0x000000ff)? 2 :
|
||||
(n <= 0x00000fff)? 3 :
|
||||
(n <= 0x0000ffff)? 4 :
|
||||
(n <= 0x000fffff)? 5 : 6;
|
||||
}
|
||||
continue;
|
||||
}
|
||||
@@ -371,7 +409,8 @@ while (length-- > 0)
|
||||
|
||||
/* Not UTF-8, or malformed UTF-8 */
|
||||
|
||||
if (isprint(c = *(p++)))
|
||||
c = *p++;
|
||||
if (PRINTHEX(c))
|
||||
{
|
||||
if (f != NULL) fprintf(f, "%c", c);
|
||||
yield++;
|
||||
@@ -614,7 +653,7 @@ return count;
|
||||
*************************************************/
|
||||
|
||||
/* This is used both at compile and run-time to check for <xxx> escapes, where
|
||||
xxx is LF, CR, or CRLF. Print a message and return 0 if there is no match.
|
||||
xxx is LF, CR, CRLF, or ANY. Print a message and return 0 if there is no match.
|
||||
|
||||
Arguments:
|
||||
p points after the leading '<'
|
||||
@@ -629,12 +668,45 @@ check_newline(uschar *p, FILE *f)
|
||||
if (strncmp((char *)p, "cr>", 3) == 0) return PCRE_NEWLINE_CR;
|
||||
if (strncmp((char *)p, "lf>", 3) == 0) return PCRE_NEWLINE_LF;
|
||||
if (strncmp((char *)p, "crlf>", 5) == 0) return PCRE_NEWLINE_CRLF;
|
||||
if (strncmp((char *)p, "any>", 4) == 0) return PCRE_NEWLINE_ANY;
|
||||
fprintf(f, "Unknown newline type at: <%s\n", p);
|
||||
return 0;
|
||||
}
|
||||
|
||||
|
||||
|
||||
/*************************************************
|
||||
* Usage function *
|
||||
*************************************************/
|
||||
|
||||
static void
usage(void)
{
/* The help text goes to stdout in a fixed order. Runs of unconditional lines
are emitted by a single printf each; the -dfa and -p lines are separate
because they are compiled in only when NODFA / NOPOSIX are not defined. */

printf("Usage: pcretest [options] [<input> [<output>]]\n"
       " -b show compiled code (bytecode)\n"
       " -C show PCRE compile-time options and exit\n"
       " -d debug: show compiled code and information (-b and -i)\n");

#if !defined NODFA
printf(" -dfa force DFA matching for all subjects\n");
#endif

printf(" -help show usage information\n"
       " -i show information about compiled patterns\n"
       " -m output memory used information\n"
       " -o <n> set size of offsets vector to <n>\n");

#if !defined NOPOSIX
printf(" -p use POSIX interface\n");
#endif

printf(" -q quiet: do not output PCRE version number at start\n"
       " -S <n> set stack size to <n> megabytes\n"
       " -s output store (memory) used information\n"
       " -t time compilation and execution\n"
       " -t <n> time compilation and execution, repeating <n> times\n"
       " -tm time execution (matching) only\n"
       " -tm <n> time execution (matching) only, repeating <n> times\n");
}
|
||||
|
||||
|
||||
|
||||
/*************************************************
|
||||
* Main Program *
|
||||
*************************************************/
|
||||
@@ -650,6 +722,7 @@ int options = 0;
|
||||
int study_options = 0;
|
||||
int op = 1;
|
||||
int timeit = 0;
|
||||
int timeitm = 0;
|
||||
int showinfo = 0;
|
||||
int showstore = 0;
|
||||
int quiet = 0;
|
||||
@@ -681,16 +754,19 @@ buffer = (unsigned char *)malloc(buffer_size);
|
||||
dbuffer = (unsigned char *)malloc(buffer_size);
|
||||
pbuffer = (unsigned char *)malloc(buffer_size);
|
||||
|
||||
/* The outfile variable is static so that new_malloc can use it. The _setmode()
|
||||
stuff is some magic that I don't understand, but which apparently does good
|
||||
things in Windows. It's related to line terminations. */
|
||||
|
||||
#if defined(_WIN32) || defined(WIN32)
|
||||
_setmode( _fileno( stdout ), 0x8000 );
|
||||
#endif /* defined(_WIN32) || defined(WIN32) */
|
||||
/* The outfile variable is static so that new_malloc can use it. */
|
||||
|
||||
outfile = stdout;
|
||||
|
||||
/* The following _setmode() stuff is some Windows magic that tells its runtime
|
||||
library to translate CRLF into a single LF character. At least, that's what
|
||||
I've been told: never having used Windows I take this all on trust. Originally
|
||||
it set 0x8000, but then I was advised that _O_BINARY was better. */
|
||||
|
||||
#if defined(_WIN32) || defined(WIN32)
|
||||
_setmode( _fileno( stdout ), _O_BINARY );
|
||||
#endif
|
||||
|
||||
/* Scan options */
|
||||
|
||||
while (argc > 1 && argv[op][0] == '-')
|
||||
@@ -699,8 +775,8 @@ while (argc > 1 && argv[op][0] == '-')
|
||||
|
||||
if (strcmp(argv[op], "-s") == 0 || strcmp(argv[op], "-m") == 0)
|
||||
showstore = 1;
|
||||
else if (strcmp(argv[op], "-t") == 0) timeit = 1;
|
||||
else if (strcmp(argv[op], "-q") == 0) quiet = 1;
|
||||
else if (strcmp(argv[op], "-b") == 0) debug = 1;
|
||||
else if (strcmp(argv[op], "-i") == 0) showinfo = 1;
|
||||
else if (strcmp(argv[op], "-d") == 0) showinfo = debug = 1;
|
||||
#if !defined NODFA
|
||||
@@ -713,11 +789,25 @@ while (argc > 1 && argv[op][0] == '-')
|
||||
op++;
|
||||
argc--;
|
||||
}
|
||||
else if (strcmp(argv[op], "-t") == 0 || strcmp(argv[op], "-tm") == 0)
|
||||
{
|
||||
int both = argv[op][2] == 0;
|
||||
int temp;
|
||||
if (argc > 2 && (temp = get_value((unsigned char *)argv[op+1], &endptr),
|
||||
*endptr == 0))
|
||||
{
|
||||
timeitm = temp;
|
||||
op++;
|
||||
argc--;
|
||||
}
|
||||
else timeitm = LOOPREPEAT;
|
||||
if (both) timeit = timeitm;
|
||||
}
|
||||
else if (strcmp(argv[op], "-S") == 0 && argc > 2 &&
|
||||
((stack_size = get_value((unsigned char *)argv[op+1], &endptr)),
|
||||
*endptr == 0))
|
||||
{
|
||||
#ifdef _WIN32
|
||||
#if defined(_WIN32) || defined(WIN32)
|
||||
printf("PCRE: -S not supported on this OS\n");
|
||||
exit(1);
|
||||
#else
|
||||
@@ -749,7 +839,8 @@ while (argc > 1 && argv[op][0] == '-')
|
||||
printf(" %sUnicode properties support\n", rc? "" : "No ");
|
||||
(void)pcre_config(PCRE_CONFIG_NEWLINE, &rc);
|
||||
printf(" Newline sequence is %s\n", (rc == '\r')? "CR" :
|
||||
(rc == '\n')? "LF" : "CRLF");
|
||||
(rc == '\n')? "LF" : (rc == ('\r'<<8 | '\n'))? "CRLF" :
|
||||
(rc == -1)? "ANY" : "???");
|
||||
(void)pcre_config(PCRE_CONFIG_LINK_SIZE, &rc);
|
||||
printf(" Internal link size = %d\n", rc);
|
||||
(void)pcre_config(PCRE_CONFIG_POSIX_MALLOC_THRESHOLD, &rc);
|
||||
@@ -762,24 +853,16 @@ while (argc > 1 && argv[op][0] == '-')
|
||||
printf(" Match recursion uses %s\n", rc? "stack" : "heap");
|
||||
exit(0);
|
||||
}
|
||||
else if (strcmp(argv[op], "-help") == 0 ||
|
||||
strcmp(argv[op], "--help") == 0)
|
||||
{
|
||||
usage();
|
||||
goto EXIT;
|
||||
}
|
||||
else
|
||||
{
|
||||
printf("** Unknown or malformed option %s\n", argv[op]);
|
||||
printf("Usage: pcretest [options] [<input> [<output>]]\n");
|
||||
printf(" -C show PCRE compile-time options and exit\n");
|
||||
printf(" -d debug: show compiled code; implies -i\n");
|
||||
#if !defined NODFA
|
||||
printf(" -dfa force DFA matching for all subjects\n");
|
||||
#endif
|
||||
printf(" -i show information about compiled pattern\n"
|
||||
" -m output memory used information\n"
|
||||
" -o <n> set size of offsets vector to <n>\n");
|
||||
#if !defined NOPOSIX
|
||||
printf(" -p use POSIX interface\n");
|
||||
#endif
|
||||
printf(" -S <n> set stack size to <n> megabytes\n");
|
||||
printf(" -s output store (memory) used information\n"
|
||||
" -t time compilation and execution\n");
|
||||
usage();
|
||||
yield = 1;
|
||||
goto EXIT;
|
||||
}
|
||||
@@ -803,7 +886,7 @@ if (offsets == NULL)
|
||||
|
||||
if (argc > 1)
|
||||
{
|
||||
infile = fopen(argv[op], "rb");
|
||||
infile = fopen(argv[op], INPUT_MODE);
|
||||
if (infile == NULL)
|
||||
{
|
||||
printf("** Failed to open %s\n", argv[op]);
|
||||
@@ -814,7 +897,7 @@ if (argc > 1)
|
||||
|
||||
if (argc > 2)
|
||||
{
|
||||
outfile = fopen(argv[op+1], "wb");
|
||||
outfile = fopen(argv[op+1], OUTPUT_MODE);
|
||||
if (outfile == NULL)
|
||||
{
|
||||
printf("** Failed to open %s\n", argv[op+1]);
|
||||
@@ -859,7 +942,7 @@ while (!done)
|
||||
int do_showinfo = showinfo;
|
||||
int do_showrest = 0;
|
||||
int do_flip = 0;
|
||||
int erroroffset, len, delimiter;
|
||||
int erroroffset, len, delimiter, poffset;
|
||||
|
||||
use_utf8 = 0;
|
||||
|
||||
@@ -969,6 +1052,7 @@ while (!done)
|
||||
}
|
||||
|
||||
pp = p;
|
||||
poffset = p - buffer;
|
||||
|
||||
for(;;)
|
||||
{
|
||||
@@ -989,6 +1073,11 @@ while (!done)
|
||||
if (infile != stdin) fprintf(outfile, "%s", (char *)pp);
|
||||
}
|
||||
|
||||
/* The buffer may have moved while being extended; reset the start of data
|
||||
pointer to the correct relative point in the buffer. */
|
||||
|
||||
p = buffer + poffset;
|
||||
|
||||
/* If the first character after the delimiter is backslash, make
|
||||
the pattern end with backslash. This is purely to provide a way
|
||||
of testing for the error message when a pattern ends with backslash. */
|
||||
@@ -1020,6 +1109,7 @@ while (!done)
|
||||
|
||||
case '+': do_showrest = 1; break;
|
||||
case 'A': options |= PCRE_ANCHORED; break;
|
||||
case 'B': do_debug = 1; break;
|
||||
case 'C': options |= PCRE_AUTO_CALLOUT; break;
|
||||
case 'D': do_debug = do_showinfo = 1; break;
|
||||
case 'E': options |= PCRE_DOLLAR_ENDONLY; break;
|
||||
@@ -1042,14 +1132,16 @@ while (!done)
|
||||
|
||||
case 'L':
|
||||
ppp = pp;
|
||||
/* The '\r' test here is so that it works on Windows */
|
||||
while (*ppp != '\n' && *ppp != '\r' && *ppp != ' ') ppp++;
|
||||
/* The '\r' test here is so that it works on Windows. */
|
||||
/* The '0' test is just in case this is an unterminated line. */
|
||||
while (*ppp != 0 && *ppp != '\n' && *ppp != '\r' && *ppp != ' ') ppp++;
|
||||
*ppp = 0;
|
||||
if (setlocale(LC_CTYPE, (const char *)pp) == NULL)
|
||||
{
|
||||
fprintf(outfile, "** Failed to set locale \"%s\"\n", pp);
|
||||
goto SKIP_DATA;
|
||||
}
|
||||
locale_set = 1;
|
||||
tables = pcre_maketables();
|
||||
pp = ppp;
|
||||
break;
|
||||
@@ -1116,19 +1208,19 @@ while (!done)
|
||||
#endif /* !defined NOPOSIX */
|
||||
|
||||
{
|
||||
if (timeit)
|
||||
if (timeit > 0)
|
||||
{
|
||||
register int i;
|
||||
clock_t time_taken;
|
||||
clock_t start_time = clock();
|
||||
for (i = 0; i < LOOPREPEAT; i++)
|
||||
for (i = 0; i < timeit; i++)
|
||||
{
|
||||
re = pcre_compile((char *)p, options, &error, &erroroffset, tables);
|
||||
if (re != NULL) free(re);
|
||||
}
|
||||
time_taken = clock() - start_time;
|
||||
fprintf(outfile, "Compile time %.3f milliseconds\n",
|
||||
(((double)time_taken * 1000.0) / (double)LOOPREPEAT) /
|
||||
fprintf(outfile, "Compile time %.4f milliseconds\n",
|
||||
(((double)time_taken * 1000.0) / (double)timeit) /
|
||||
(double)CLOCKS_PER_SEC);
|
||||
}
|
||||
|
||||
@@ -1180,17 +1272,17 @@ while (!done)
|
||||
|
||||
if (do_study)
|
||||
{
|
||||
if (timeit)
|
||||
if (timeit > 0)
|
||||
{
|
||||
register int i;
|
||||
clock_t time_taken;
|
||||
clock_t start_time = clock();
|
||||
for (i = 0; i < LOOPREPEAT; i++)
|
||||
for (i = 0; i < timeit; i++)
|
||||
extra = pcre_study(re, study_options, &error);
|
||||
time_taken = clock() - start_time;
|
||||
if (extra != NULL) free(extra);
|
||||
fprintf(outfile, " Study time %.3f milliseconds\n",
|
||||
(((double)time_taken * 1000.0) / (double)LOOPREPEAT) /
|
||||
fprintf(outfile, " Study time %.4f milliseconds\n",
|
||||
(((double)time_taken * 1000.0) / (double)timeit) /
|
||||
(double)CLOCKS_PER_SEC);
|
||||
}
|
||||
extra = pcre_study(re, study_options, &error);
|
||||
@@ -1233,6 +1325,12 @@ while (!done)
|
||||
|
||||
SHOW_INFO:
|
||||
|
||||
if (do_debug)
|
||||
{
|
||||
fprintf(outfile, "------------------------------------------------------------------\n");
|
||||
pcre_printint(re, outfile);
|
||||
}
|
||||
|
||||
if (do_showinfo)
|
||||
{
|
||||
unsigned long int get_options, all_options;
|
||||
@@ -1243,12 +1341,6 @@ while (!done)
|
||||
int nameentrysize, namecount;
|
||||
const uschar *nametable;
|
||||
|
||||
if (do_debug)
|
||||
{
|
||||
fprintf(outfile, "------------------------------------------------------------------\n");
|
||||
pcre_printint(re, outfile);
|
||||
}
|
||||
|
||||
new_info(re, NULL, PCRE_INFO_OPTIONS, &get_options);
|
||||
new_info(re, NULL, PCRE_INFO_SIZE, &size);
|
||||
new_info(re, NULL, PCRE_INFO_CAPTURECOUNT, &count);
|
||||
@@ -1327,7 +1419,7 @@ while (!done)
|
||||
((get_options & PCRE_NO_UTF8_CHECK) != 0)? " no_utf8_check" : "",
|
||||
((get_options & PCRE_DUPNAMES) != 0)? " dupnames" : "");
|
||||
|
||||
switch (get_options & PCRE_NEWLINE_CRLF)
|
||||
switch (get_options & PCRE_NEWLINE_BITS)
|
||||
{
|
||||
case PCRE_NEWLINE_CR:
|
||||
fprintf(outfile, "Forced newline sequence: CR\n");
|
||||
@@ -1341,6 +1433,10 @@ while (!done)
|
||||
fprintf(outfile, "Forced newline sequence: CRLF\n");
|
||||
break;
|
||||
|
||||
case PCRE_NEWLINE_ANY:
|
||||
fprintf(outfile, "Forced newline sequence: ANY\n");
|
||||
break;
|
||||
|
||||
default:
|
||||
break;
|
||||
}
|
||||
@@ -1358,7 +1454,7 @@ while (!done)
|
||||
int ch = first_char & 255;
|
||||
const char *caseless = ((first_char & REQ_CASELESS) == 0)?
|
||||
"" : " (caseless)";
|
||||
if (isprint(ch))
|
||||
if (PRINTHEX(ch))
|
||||
fprintf(outfile, "First char = \'%c\'%s\n", ch, caseless);
|
||||
else
|
||||
fprintf(outfile, "First char = %d%s\n", ch, caseless);
|
||||
@@ -1373,7 +1469,7 @@ while (!done)
|
||||
int ch = need_char & 255;
|
||||
const char *caseless = ((need_char & REQ_CASELESS) == 0)?
|
||||
"" : " (caseless)";
|
||||
if (isprint(ch))
|
||||
if (PRINTHEX(ch))
|
||||
fprintf(outfile, "Need char = \'%c\'%s\n", ch, caseless);
|
||||
else
|
||||
fprintf(outfile, "Need char = %d%s\n", ch, caseless);
|
||||
@@ -1409,7 +1505,7 @@ while (!done)
|
||||
fprintf(outfile, "\n ");
|
||||
c = 2;
|
||||
}
|
||||
if (isprint(i) && i != ' ')
|
||||
if (PRINTHEX(i) && i != ' ')
|
||||
{
|
||||
fprintf(outfile, "%c ", i);
|
||||
c += 2;
|
||||
@@ -1468,6 +1564,7 @@ while (!done)
|
||||
strerror(errno));
|
||||
}
|
||||
else fprintf(outfile, "Study data written to %s\n", to_file);
|
||||
|
||||
}
|
||||
}
|
||||
fclose(f);
|
||||
@@ -1866,7 +1963,7 @@ while (!done)
|
||||
|
||||
for (;; gmatched++) /* Loop for /g or /G */
|
||||
{
|
||||
if (timeit)
|
||||
if (timeitm > 0)
|
||||
{
|
||||
register int i;
|
||||
clock_t time_taken;
|
||||
@@ -1876,7 +1973,7 @@ while (!done)
|
||||
if (all_use_dfa || use_dfa)
|
||||
{
|
||||
int workspace[1000];
|
||||
for (i = 0; i < LOOPREPEAT; i++)
|
||||
for (i = 0; i < timeitm; i++)
|
||||
count = pcre_dfa_exec(re, NULL, (char *)bptr, len, start_offset,
|
||||
options | g_notempty, use_offsets, use_size_offsets, workspace,
|
||||
sizeof(workspace)/sizeof(int));
|
||||
@@ -1884,13 +1981,13 @@ while (!done)
|
||||
else
|
||||
#endif
|
||||
|
||||
for (i = 0; i < LOOPREPEAT; i++)
|
||||
for (i = 0; i < timeitm; i++)
|
||||
count = pcre_exec(re, extra, (char *)bptr, len,
|
||||
start_offset, options | g_notempty, use_offsets, use_size_offsets);
|
||||
|
||||
time_taken = clock() - start_time;
|
||||
fprintf(outfile, "Execute time %.3f milliseconds\n",
|
||||
(((double)time_taken * 1000.0) / (double)LOOPREPEAT) /
|
||||
fprintf(outfile, "Execute time %.4f milliseconds\n",
|
||||
(((double)time_taken * 1000.0) / (double)timeitm) /
|
||||
(double)CLOCKS_PER_SEC);
|
||||
}
|
||||
|
||||
@@ -1966,7 +2063,28 @@ while (!done)
|
||||
|
||||
if (count >= 0)
|
||||
{
|
||||
int i;
|
||||
int i, maxcount;
|
||||
|
||||
#if !defined NODFA
|
||||
if (all_use_dfa || use_dfa) maxcount = use_size_offsets/2; else
|
||||
#endif
|
||||
maxcount = use_size_offsets/3;
|
||||
|
||||
/* This is a check against a lunatic return value. */
|
||||
|
||||
if (count > maxcount)
|
||||
{
|
||||
fprintf(outfile,
|
||||
"** PCRE error: returned count %d is too big for offset size %d\n",
|
||||
count, use_size_offsets);
|
||||
count = use_size_offsets/3;
|
||||
if (do_g || do_G)
|
||||
{
|
||||
fprintf(outfile, "** /%c loop abandoned\n", do_g? 'g' : 'G');
|
||||
do_g = do_G = FALSE; /* Break g/G loop */
|
||||
}
|
||||
}
|
||||
|
||||
for (i = 0; i < count * 2; i += 2)
|
||||
{
|
||||
if (use_offsets[i] < 0)
|
||||
@@ -2165,6 +2283,7 @@ while (!done)
|
||||
{
|
||||
new_free((void *)tables);
|
||||
setlocale(LC_CTYPE, "C");
|
||||
locale_set = 0;
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
3
ext/pcre/pcrelib/testdata/grepinput
vendored
3
ext/pcre/pcrelib/testdata/grepinput
vendored
@@ -593,7 +593,8 @@ aaaaa2
|
||||
ffffffffff
|
||||
|
||||
This is a line before the binary zero.
|
||||
This line contains a binary zero here >This is a line after the binary zero.
|
||||
This line contains a binary zero here > | ||||