1
0
mirror of https://github.com/php/php-src.git synced 2026-03-29 19:52:20 +02:00

upgrade pcre to version 7.0

This commit is contained in:
Nuno Lopes
2007-02-09 19:48:47 +00:00
parent e6d69595af
commit b3e66c616d
65 changed files with 10215 additions and 4653 deletions

1
NEWS
View File

@@ -2,6 +2,7 @@ PHP NEWS
|||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||
?? ??? 2007, PHP 5.2.2
- Upgraded SQLite 3 to version 3.3.12 (Ilia)
- Upgraded PCRE to version 7.0 (Nuno)
- Add --ri switch to CLI which allows to check extension information. (Marcus)
- Fixed bug #40410 (ext/posix does not compile on MacOS 10.3.9). (Tony)
- Fixed bug #39836 (SplObjectStorage empty after unserialize). (Marcus)

View File

@@ -5,8 +5,8 @@ ARG_WITH("pcre-regex", "Perl Compatible Regular Expressions", "yes");
if (PHP_PCRE_REGEX == "yes") {
EXTENSION("pcre", "php_pcre.c", PHP_PCRE_REGEX_SHARED,
"-DEXPORT= -DNEWLINE=10 -DSUPPORT_UTF8 -DSUPPORT_UCP -DLINK_SIZE=2 -DPOSIX_MALLOC_THRESHOLD=10 -DMATCH_LIMIT=10000000 -DMATCH_LIMIT_RECURSION=10000000 -DMAX_NAME_SIZE=32 -DMAX_NAME_COUNT=10000 -DMAX_DUPLENGTH=30000 -DNO_RECURSE -Iext/pcre/pcrelib");
ADD_SOURCES("ext/pcre/pcrelib", "pcre_chartables.c pcre_ucp_searchfuncs.c pcre_compile.c pcre_config.c pcre_exec.c pcre_fullinfo.c pcre_get.c pcre_globals.c pcre_info.c pcre_maketables.c pcre_ord2utf8.c pcre_refcount.c pcre_study.c pcre_tables.c pcre_try_flipped.c pcre_valid_utf8.c pcre_version.c pcre_xclass.c", "pcre");
"-DEXPORT= -DNEWLINE=10 -DSUPPORT_UTF8 -DSUPPORT_UCP -DLINK_SIZE=2 -DPOSIX_MALLOC_THRESHOLD=10 -DMATCH_LIMIT=10000000 -DMATCH_LIMIT_RECURSION=10000000 -DMAX_NAME_SIZE=32 -DMAX_NAME_COUNT=10000 -DMAX_DUPLENGTH=30000 -DEBCDIC=0 -DNO_RECURSE -Iext/pcre/pcrelib");
ADD_SOURCES("ext/pcre/pcrelib", "pcre_chartables.c pcre_ucp_searchfuncs.c pcre_compile.c pcre_config.c pcre_exec.c pcre_fullinfo.c pcre_get.c pcre_globals.c pcre_info.c pcre_maketables.c pcre_newline.c pcre_ord2utf8.c pcre_refcount.c pcre_study.c pcre_tables.c pcre_try_flipped.c pcre_valid_utf8.c pcre_version.c pcre_xclass.c", "pcre");
ADD_DEF_FILE("ext\\pcre\\php_pcre.def");
AC_DEFINE('HAVE_BUNDLED_PCRE', 1, 'Using bundled PCRE library');

View File

@@ -13,7 +13,7 @@ PHP_ARG_WITH(pcre-regex,for PCRE support,
if test "$PHP_PCRE_REGEX" != "no"; then
if test "$PHP_PCRE_REGEX" = "yes"; then
PHP_NEW_EXTENSION(pcre, pcrelib/pcre_chartables.c pcrelib/pcre_ucp_searchfuncs.c pcrelib/pcre_compile.c pcrelib/pcre_config.c pcrelib/pcre_exec.c pcrelib/pcre_fullinfo.c pcrelib/pcre_get.c pcrelib/pcre_globals.c pcrelib/pcre_info.c pcrelib/pcre_maketables.c pcrelib/pcre_ord2utf8.c pcrelib/pcre_refcount.c pcrelib/pcre_study.c pcrelib/pcre_tables.c pcrelib/pcre_try_flipped.c pcrelib/pcre_valid_utf8.c pcrelib/pcre_version.c pcrelib/pcre_xclass.c php_pcre.c, $ext_shared,,-DEXPORT= -DNEWLINE=10 -DSUPPORT_UTF8 -DSUPPORT_UCP -DLINK_SIZE=2 -DPOSIX_MALLOC_THRESHOLD=10 -DMATCH_LIMIT=10000000 -DMATCH_LIMIT_RECURSION=10000000 -DMAX_NAME_SIZE=32 -DMAX_NAME_COUNT=10000 -DMAX_DUPLENGTH=30000 -I@ext_srcdir@/pcrelib)
PHP_NEW_EXTENSION(pcre, pcrelib/pcre_chartables.c pcrelib/pcre_ucp_searchfuncs.c pcrelib/pcre_compile.c pcrelib/pcre_config.c pcrelib/pcre_exec.c pcrelib/pcre_fullinfo.c pcrelib/pcre_get.c pcrelib/pcre_globals.c pcrelib/pcre_info.c pcrelib/pcre_maketables.c pcrelib/pcre_newline.c pcrelib/pcre_ord2utf8.c pcrelib/pcre_refcount.c pcrelib/pcre_study.c pcrelib/pcre_tables.c pcrelib/pcre_try_flipped.c pcrelib/pcre_valid_utf8.c pcrelib/pcre_version.c pcrelib/pcre_xclass.c php_pcre.c, $ext_shared,,-DEXPORT= -DNEWLINE=10 -DSUPPORT_UTF8 -DSUPPORT_UCP -DLINK_SIZE=2 -DPOSIX_MALLOC_THRESHOLD=10 -DMATCH_LIMIT=10000000 -DMATCH_LIMIT_RECURSION=10000000 -DMAX_NAME_SIZE=32 -DMAX_NAME_COUNT=10000 -DMAX_DUPLENGTH=30000 -DEBCDIC=0 -I@ext_srcdir@/pcrelib)
PHP_ADD_BUILD_DIR($ext_builddir/pcrelib)
PHP_INSTALL_HEADERS([ext/pcre], [php_pcre.h pcrelib/])
AC_DEFINE(HAVE_BUNDLED_PCRE, 1, [ ])

View File

@@ -4,7 +4,7 @@ PCRE LICENCE
PCRE is a library of functions to support regular expressions whose syntax
and semantics are as close as possible to those of the Perl 5 language.
Release 6 of PCRE is distributed under the terms of the "BSD" licence, as
Release 7 of PCRE is distributed under the terms of the "BSD" licence, as
specified below. The documentation for PCRE, supplied in the "doc"
directory, is distributed under the same terms as the software itself.

View File

@@ -1,6 +1,279 @@
ChangeLog for PCRE
------------------
Version 7.0 19-Dec-06
---------------------
1. Fixed a signed/unsigned compiler warning in pcre_compile.c, shown up by
moving to gcc 4.1.1.
2. The -S option for pcretest uses setrlimit(); I had omitted to #include
sys/time.h, which is documented as needed for this function. It doesn't
seem to matter on Linux, but it showed up on some releases of OS X.
3. It seems that there are systems where bytes whose values are greater than
127 match isprint() in the "C" locale. The "C" locale should be the
default when a C program starts up. In most systems, only ASCII printing
characters match isprint(). This difference caused the output from pcretest
to vary, making some of the tests fail. I have changed pcretest so that:
(a) When it is outputting text in the compiled version of a pattern, bytes
other than 32-126 are always shown as hex escapes.
(b) When it is outputting text that is a matched part of a subject string,
it does the same, unless a different locale has been set for the match
(using the /L modifier). In this case, it uses isprint() to decide.
4. Fixed a major bug that caused incorrect computation of the amount of memory
required for a compiled pattern when options that changed within the
pattern affected the logic of the preliminary scan that determines the
length. The relevant options are -x, and -i in UTF-8 mode. The result was
that the computed length was too small. The symptoms of this bug were
either the PCRE error "internal error: code overflow" from pcre_compile(),
or a glibc crash with a message such as "pcretest: free(): invalid next
size (fast)". Examples of patterns that provoked this bug (shown in
pcretest format) are:
/(?-x: )/x
/(?x)(?-x: \s*#\s*)/
/((?i)[\x{c0}])/8
/(?i:[\x{c0}])/8
HOWEVER: Change 17 below makes this fix obsolete as the memory computation
is now done differently.
5. Applied patches from Google to: (a) add a QuoteMeta function to the C++
wrapper classes; (b) implement a new function in the C++ scanner that is
more efficient than the old way of doing things because it avoids levels of
recursion in the regex matching; (c) add a paragraph to the documentation
for the FullMatch() function.
6. The escape sequence \n was being treated as whatever was defined as
"newline". Not only was this contrary to the documentation, which states
that \n is character 10 (hex 0A), but it also went horribly wrong when
"newline" was defined as CRLF. This has been fixed.
7. In pcre_dfa_exec.c the value of an unsigned integer (the variable called c)
was being set to -1 for the "end of line" case (supposedly a value that no
character can have). Though this value is never used (the check for end of
line is "zero bytes in current character"), it caused compiler complaints.
I've changed it to 0xffffffff.
8. In pcre_version.c, the version string was being built by a sequence of
C macros that, in the event of PCRE_PRERELEASE being defined as an empty
string (as it is for production releases) called a macro with an empty
argument. The C standard says the result of this is undefined. The gcc
compiler treats it as an empty string (which was what was wanted) but it is
reported that Visual C gives an error. The source has been hacked around to
avoid this problem.
9. On the advice of a Windows user, included <io.h> and <fcntl.h> in Windows
builds of pcretest, and changed the call to _setmode() to use _O_BINARY
instead of 0x8000. Made all the #ifdefs test both _WIN32 and WIN32 (not all
of them did).
10. Originally, pcretest opened its input and output without "b"; then I was
told that "b" was needed in some environments, so it was added for release
5.0 to both the input and output. (It makes no difference on Unix-like
systems.) Later I was told that it is wrong for the input on Windows. I've
now abstracted the modes into two macros, to make it easier to fiddle with
them, and removed "b" from the input mode under Windows.
11. Added pkgconfig support for the C++ wrapper library, libpcrecpp.
12. Added -help and --help to pcretest as an official way of being reminded
of the options.
13. Removed some redundant semicolons after macro calls in pcrecpparg.h.in
and pcrecpp.cc because they annoy compilers at high warning levels.
14. A bit of tidying/refactoring in pcre_exec.c in the main bumpalong loop.
15. Fixed an occurrence of == in configure.ac that should have been = (shell
scripts are not C programs :-) and which was not noticed because it works
on Linux.
16. pcretest is supposed to handle any length of pattern and data line (as one
line or as a continued sequence of lines) by extending its input buffer if
necessary. This feature was broken for very long pattern lines, leading to
a string of junk being passed to pcre_compile() if the pattern was longer
than about 50K.
17. I have done a major re-factoring of the way pcre_compile() computes the
amount of memory needed for a compiled pattern. Previously, there was code
that made a preliminary scan of the pattern in order to do this. That was
OK when PCRE was new, but as the facilities have expanded, it has become
harder and harder to keep it in step with the real compile phase, and there
have been a number of bugs (see for example, 4 above). I have now found a
cunning way of running the real compile function in a "fake" mode that
enables it to compute how much memory it would need, while actually only
ever using a few hundred bytes of working memory and without too many
tests of the mode. This should make future maintenance and development
easier. A side effect of this work is that the limit of 200 on the nesting
depth of parentheses has been removed (though this was never a serious
limitation, I suspect). However, there is a downside: pcre_compile() now
runs more slowly than before (30% or more, depending on the pattern). I
hope this isn't a big issue. There is no effect on runtime performance.
18. Fixed a minor bug in pcretest: if a pattern line was not terminated by a
newline (only possible for the last line of a file) and it was a
pattern that set a locale (followed by /Lsomething), pcretest crashed.
19. Added additional timing features to pcretest. (1) The -tm option now times
matching only, not compiling. (2) Both -t and -tm can be followed, as a
separate command line item, by a number that specifies the number of
repeats to use when timing. The default is 50000; this gives better
precision, but takes uncomfortably long for very large patterns.
20. Extended pcre_study() to be more clever in cases where a branch of a
subpattern has no definite first character. For example, (a*|b*)[cd] would
previously give no result from pcre_study(). Now it recognizes that the
first character must be a, b, c, or d.
21. There was an incorrect error "recursive call could loop indefinitely" if
a subpattern (or the entire pattern) that was being tested for matching an
empty string contained only one non-empty item after a nested subpattern.
For example, the pattern (?>\x{100}*)\d(?R) provoked this error
incorrectly, because the \d was being skipped in the check.
22. The pcretest program now has a new pattern option /B and a command line
option -b, which is equivalent to adding /B to every pattern. This causes
it to show the compiled bytecode, without the additional information that
-d shows. The effect of -d is now the same as -b with -i (and similarly, /D
is the same as /B/I).
23. A new optimization is now able automatically to treat some sequences such
as a*b as a*+b. More specifically, if something simple (such as a character
or a simple class like \d) has an unlimited quantifier, and is followed by
something that cannot possibly match the quantified thing, the quantifier
is automatically "possessified".
24. A recursive reference to a subpattern whose number was greater than 39
went wrong under certain circumstances in UTF-8 mode. This bug could also
have affected the operation of pcre_study().
25. Realized that a little bit of performance could be had by replacing
(c & 0xc0) == 0xc0 with c >= 0xc0 when processing UTF-8 characters.
26. Timing data from pcretest is now shown to 4 decimal places instead of 3.
27. Possessive quantifiers such as a++ were previously implemented by turning
them into atomic groups such as (?>a+). Now they have their own opcodes,
which improves performance. This includes the automatically created ones
from 23 above.
28. A pattern such as (?=(\w+))\1: which simulates an atomic group using a
lookahead was broken if it was not anchored. PCRE was mistakenly expecting
the first matched character to be a colon. This applied both to named and
numbered groups.
29. The ucpinternal.h header file was missing its idempotency #ifdef.
30. I was sent a "project" file called libpcre.a.dev which I understand makes
building PCRE on Windows easier, so I have included it in the distribution.
31. There is now a check in pcretest against a ridiculously large number being
returned by pcre_exec() or pcre_dfa_exec(). If this happens in a /g or /G
loop, the loop is abandoned.
32. Forward references to subpatterns in conditions such as (?(2)...) where
subpattern 2 is defined later cause pcre_compile() to search forwards in
the pattern for the relevant set of parentheses. This search went wrong
when there were unescaped parentheses in a character class, parentheses
escaped with \Q...\E, or parentheses in a #-comment in /x mode.
33. "Subroutine" calls and backreferences were previously restricted to
referencing subpatterns earlier in the regex. This restriction has now
been removed.
34. Added a number of extra features that are going to be in Perl 5.10. On the
whole, these are just syntactic alternatives for features that PCRE had
previously implemented using the Python syntax or my own invention. The
other formats are all retained for compatibility.
(a) Named groups can now be defined as (?<name>...) or (?'name'...) as well
as (?P<name>...). The new forms, as well as being in Perl 5.10, are
also .NET compatible.
(b) A recursion or subroutine call to a named group can now be defined as
(?&name) as well as (?P>name).
(c) A backreference to a named group can now be defined as \k<name> or
\k'name' as well as (?P=name). The new forms, as well as being in Perl
5.10, are also .NET compatible.
(d) A conditional reference to a named group can now use the syntax
(?(<name>) or (?('name') as well as (?(name).
(e) A "conditional group" of the form (?(DEFINE)...) can be used to define
groups (named and numbered) that are never evaluated inline, but can be
called as "subroutines" from elsewhere. In effect, the DEFINE condition
is always false. There may be only one alternative in such a group.
(f) A test for recursion can be given as (?(R1).. or (?(R&name)... as well
as the simple (?(R). The condition is true only if the most recent
recursion is that of the given number or name. It does not search out
through the entire recursion stack.
(g) The escape \gN or \g{N} has been added, where N is a positive or
negative number, specifying an absolute or relative reference.
35. Tidied to get rid of some further signed/unsigned compiler warnings and
some "unreachable code" warnings.
36. Updated the Unicode property tables to Unicode version 5.0.0. Amongst other
things, this adds five new scripts.
37. Perl ignores orphaned \E escapes completely. PCRE now does the same.
There were also incompatibilities regarding the handling of \Q..\E inside
character classes, for example with patterns like [\Qa\E-\Qz\E] where the
hyphen was adjacent to \Q or \E. I hope I've cleared all this up now.
38. Like Perl, PCRE detects when an indefinitely repeated parenthesized group
matches an empty string, and forcibly breaks the loop. There were bugs in
this code in non-simple cases. For a pattern such as ^(a()*)* matched
against aaaa the result was just "a" rather than "aaaa", for example. Two
separate and independent bugs (that affected different cases) have been
fixed.
39. Refactored the code to abolish the use of different opcodes for small
capturing bracket numbers. This is a tidy that I avoided doing when I
removed the limit on the number of capturing brackets for 3.5 back in 2001.
The new approach is not only tidier, it makes it possible to reduce the
memory needed to fix the previous bug (38).
40. Implemented PCRE_NEWLINE_ANY to recognize any of the Unicode newline
sequences (http://unicode.org/unicode/reports/tr18/) as "newline" when
processing dot, circumflex, or dollar metacharacters, or #-comments in /x
mode.
41. Add \R to match any Unicode newline sequence, as suggested in the Unicode
report.
42. Applied patch, originally from Ari Pollak, modified by Google, to allow
copy construction and assignment in the C++ wrapper.
43. Updated pcregrep to support "--newline=any". In the process, I fixed a
couple of bugs that could have given wrong results in the "--newline=crlf"
case.
44. Added a number of casts and did some reorganization of signed/unsigned int
variables following suggestions from Dair Grant. Also renamed the variable
"this" as "item" because it is a C++ keyword.
45. Arranged for dftables to add
#include "pcre_internal.h"
to pcre_chartables.c because without it, gcc 4.x may remove the array
definition from the final binary if PCRE is built into a static library and
dead code stripping is activated.
46. For an unanchored pattern, if a match attempt fails at the start of a
newline sequence, and the newline setting is CRLF or ANY, and the next two
characters are CRLF, advance by two characters instead of one.
Version 6.7 04-Jul-06
---------------------

View File

@@ -4,7 +4,7 @@ PCRE LICENCE
PCRE is a library of functions to support regular expressions whose syntax
and semantics are as close as possible to those of the Perl 5 language.
Release 6 of PCRE is distributed under the terms of the "BSD" licence, as
Release 7 of PCRE is distributed under the terms of the "BSD" licence, as
specified below. The documentation for PCRE, supplied in the "doc"
directory, is distributed under the same terms as the software itself.

View File

@@ -1,6 +1,36 @@
News about PCRE releases
------------------------
Release 7.0 23-Nov-06
---------------------
This release has a new major number because there have been some internal
upheavals to facilitate the addition of new optimizations and other facilities,
and to make subsequent maintenance and extension easier. Compilation is likely
to be a bit slower, but there should be no major effect on runtime performance.
Previously compiled patterns are NOT upwards compatible with this release. If
you have saved compiled patterns from a previous release, you will have to
re-compile them. Important changes that are visible to users are:
1. The Unicode property tables have been updated to Unicode 5.0.0, which adds
some more scripts.
2. The option PCRE_NEWLINE_ANY causes PCRE to recognize any Unicode newline
sequence as a newline.
3. The \R escape matches a single Unicode newline sequence as a single unit.
4. New features that will appear in Perl 5.10 are now in PCRE. These include
alternative Perl syntax for named parentheses, and Perl syntax for
recursion.
5. The C++ wrapper interface has been extended by the addition of a
QuoteMeta function and the ability to allow copy construction and
assignment.
For a complete list of changes, see the ChangeLog file.
Release 6.7 04-Jul-06
---------------------

View File

@@ -22,7 +22,7 @@ The following are generic comments about building PCRE. The interspersed
indented commands are suggestions from Mark Tetrode as to which commands you
might use on a Windows system to build a static library.
(1) Copy or rename the file config.in as config.h, and change the macros that
(1) Copy or rename the file config.h.in as config.h, and change the macros that
define HAVE_STRERROR and HAVE_MEMMOVE to define them as 1 rather than 0.
Unfortunately, because of the way Unix autoconf works, the default setting has
to be 0. You may also want to make changes to other macros in config.h. In
@@ -31,7 +31,7 @@ the NEWLINE macro. The default is to use '\n', thereby using whatever value
your compiler gives to '\n'.
rem Mark Tetrode's commands
copy config.in config.h
copy config.h.in config.h
rem Use write, because notepad cannot handle UNIX files. Change values.
write config.h
@@ -56,6 +56,7 @@ character tables and writes them to that file.
pcre_globals.c
pcre_info.c
pcre_maketables.c
pcre_newline.c
pcre_ord2utf8.c
pcre_refcount.c
pcre_study.c
@@ -93,10 +94,10 @@ pcre and pcreposix libraries when linking.
cl /F0x400000 pcretest.c pcre.lib pcreposix.lib
(6) Run pcretest on the testinput files in the testdata directory, and check
that the output matches the corresponding testoutput files. You must use the
-i option when checking testinput2. Note that the supplied files are in Unix
format, with just LF characters as line terminators. You may need to edit them
to change this if your system uses a different convention.
that the output matches the corresponding testoutput files. Note that the
supplied files are in Unix format, with just LF characters as line terminators.
You may need to edit them to change this if your system uses a different
convention.
rem Mark Tetrode's commands
pcretest testdata\testinput1 testdata\myoutput1
@@ -135,6 +136,17 @@ If you have a system without "configure" but where you can use a Makefile, edit
Makefile.in to create Makefile, substituting suitable values for the variables
at the head of the file.
Michael Roy sent these comments about building PCRE under Windows with BCC5.5:
Some of the core BCC libraries have a version of PCRE from 1998 built in,
which can lead to pcre_exec() giving an erroneous PCRE_ERROR_NULL from a
version mismatch. I'm including an easy workaround below, if you'd like to
include it in the non-unix instructions:
When linking a project with BCC5.5, pcre.lib must be included before any of
the libraries cw32.lib, cw32i.lib, cw32mt.lib, and cw32mti.lib on the command
line.
Some help in building a Win32 DLL of PCRE in GnuWin32 environments was
contributed by Paul Sokolovsky. These environments are Mingw32
(http://www.xraylith.wisc.edu/~khan/software/gnu-win32/) and CygWin

View File

@@ -118,13 +118,13 @@ library. You can read more about them in the pcrebuild man page.
property table); only the basic two-letter properties such as Lu are
supported.
. You can build PCRE to recognize either CR or LF or the sequence CRLF as
indicating the end of a line. Whatever you specify at build time is the
default; the caller of PCRE can change the selection at run time. The default
newline indicator is a single LF character (the Unix standard). You can
specify the default newline indicator by adding --newline-is-cr or
--newline-is-lf or --newline-is-crlf to the "configure" command,
respectively.
. You can build PCRE to recognize either CR or LF or the sequence CRLF or any
of the Unicode newline sequences as indicating the end of a line. Whatever
you specify at build time is the default; the caller of PCRE can change the
selection at run time. The default newline indicator is a single LF character
(the Unix standard). You can specify the default newline indicator by adding
--newline-is-cr or --newline-is-lf or --newline-is-crlf or --newline-is-any
to the "configure" command, respectively.
. When called via the POSIX interface, PCRE uses malloc() to get additional
storage for processing capturing parentheses if there are more than 10 of
@@ -283,7 +283,7 @@ to the values of CC and CFLAGS.
Using HP's ANSI C++ compiler (aCC)
----------------------------------
Unless C++ support is disabled by specifiying the "--disable-cpp" option of the
Unless C++ support is disabled by specifying the "--disable-cpp" option of the
"configure" script, you *must* include the "-AA" option in the CXXFLAGS
environment variable in order for the C++ components to compile correctly.
@@ -305,8 +305,8 @@ PCRE in the same way as for Unix systems.
PCRE has been compiled on Windows systems and on Macintoshes, but I don't know
the details because I don't use those systems. It should be straightforward to
build PCRE on any system that has a Standard C compiler, because it uses only
Standard C functions.
build PCRE on any system that has a Standard C compiler and library, because it
uses only Standard C functions.
Testing PCRE
@@ -325,15 +325,15 @@ NON-UNIX-USE.
The RunTest script runs the pcretest test program (which is documented in its
own man page) on each of the testinput files (in the testdata directory) in
turn, and compares the output with the contents of the corresponding testoutput
file. A file called testtry is used to hold the main output from pcretest
files. A file called testtry is used to hold the main output from pcretest
(testsavedregex is also used as a working file). To run pcretest on just one of
the test files, give its number as an argument to RunTest, for example:
RunTest 2
The first file can also be fed directly into the perltest script to check that
Perl gives the same results. The only difference you should see is in the first
few lines, where the Perl version is given instead of the PCRE version.
The first test file can also be fed directly into the perltest script to check
that Perl gives the same results. The only difference you should see is in the
first few lines, where the Perl version is given instead of the PCRE version.
The second set of tests check pcre_fullinfo(), pcre_info(), pcre_study(),
pcre_copy_substring(), pcre_get_substring(), pcre_get_substring_list(), error
@@ -442,6 +442,7 @@ The distribution should contain the following files:
pcre_globals.c ) and some internal functions that they use
pcre_info.c )
pcre_maketables.c )
pcre_newline.c )
pcre_ord2utf8.c )
pcre_refcount.c )
pcre_study.c )
@@ -525,4 +526,4 @@ The distribution should contain the following files:
Philip Hazel
Email local part: ph10
Email domain: cam.ac.uk
June 2006
November 2006

View File

@@ -6,7 +6,7 @@
and semantics are as close as possible to those of the Perl 5 language.
Written by Philip Hazel
Copyright (c) 1997-2007 University of Cambridge
Copyright (c) 1997-2006 University of Cambridge
-----------------------------------------------------------------------------
Redistribution and use in source and binary forms, with or without
@@ -86,7 +86,16 @@ fprintf(f,
fprintf(f,
"This file contains the default tables for characters with codes less than\n"
"128 (ASCII characters). These tables are used when no external tables are\n"
"passed to PCRE. */\n\n"
"passed to PCRE.\n\n");
fprintf(f,
"The following #include is present because without it gcc 4.x may remove\n"
"the array definition from the final binary if PCRE is built into a static\n"
"library and dead code stripping is activated. This leads to link errors.\n"
"Pulling in the header ensures that the array gets flagged as \"someone\n"
"outside this compilation unit might reference this\" and so it will always\n"
"be supplied to the linker. */\n\n"
"#include \"pcre_internal.h\"\n\n");
fprintf(f,
"const unsigned char _pcre_default_tables[] = {\n\n"
"/* This table is a lower casing table. */\n\n");

View File

@@ -16,10 +16,11 @@ not operate by backtracking, as the original Henry Spencer code and current
Perl code does, but instead checked all possibilities simultaneously by keeping
a list of current states and checking all of them as it advanced through the
subject string. In the terminology of Jeffrey Friedl's book, it was a "DFA
algorithm". When the pattern was all used up, all remaining states were
possible matches, and the one matching the longest subset of the subject string
was chosen. This did not necessarily maximize the individual wild portions of
the pattern, as is expected in Unix and Perl-style regular expressions.
algorithm", though it was not a traditional Finite State Machine (FSM). When
the pattern was all used up, all remaining states were possible matches, and
the one matching the longest subset of the subject string was chosen. This did
not necessarily maximize the individual wild portions of the pattern, as is
expected in Unix and Perl-style regular expressions.
Historical note 2
-----------------
@@ -41,14 +42,38 @@ unrelated to those mentioned above), I tried at first to invent an algorithm
that used an amount of store bounded by a multiple of the number of characters
in the pattern, to save on compiling time. However, because of the greater
complexity in Perl regular expressions, I couldn't do this. In any case, a
first pass through the pattern is needed, for a number of reasons. PCRE works
by running a very degenerate first pass to calculate a maximum store size, and
then a second pass to do the real compile - which may use a bit less than the
predicted amount of store. The idea is that this is going to turn out faster
because the first pass is degenerate and the second pass can just store stuff
straight into the vector, which it knows is big enough. It does make the
compiling functions bigger, of course, but they have become quite big anyway to
handle all the Perl stuff.
first pass through the pattern is helpful for other reasons.
Computing the memory requirement: how it was
--------------------------------------------
Up to and including release 6.7, PCRE worked by running a very degenerate first
pass to calculate a maximum store size, and then a second pass to do the real
compile - which might use a bit less than the predicted amount of memory. The
idea was that this would turn out faster than the Henry Spencer code because
the first pass is degenerate and the second pass can just store stuff straight
into the vector, which it knows is big enough.
Computing the memory requirement: how it is
-------------------------------------------
By the time I was working on a potential 6.8 release, the degenerate first pass
had become very complicated and hard to maintain. Indeed one of the early
things I did for 6.8 was to fix Yet Another Bug in the memory computation. Then
I had a flash of inspiration as to how I could run the real compile function in
a "fake" mode that enables it to compute how much memory it would need, while
actually only ever using a few hundred bytes of working memory, and without too
many tests of the mode that might slow it down. So I re-factored the compiling
functions to work this way. This got rid of about 600 lines of source. It
should make future maintenance and development easier. As this was such a major
change, I never released 6.8, instead upping the number to 7.0 (other quite
major changes are also present in the 7.0 release).
A side effect of this work is that the previous limit of 200 on the nesting
depth of parentheses was removed. However, there is a downside: pcre_compile()
runs more slowly than before (30% or more, depending on the pattern) because it
is doing a full analysis of the pattern. My hope is that this is not a big
issue.
Traditional matching function
-----------------------------
@@ -70,6 +95,12 @@ interprets the same compiled pattern data as pcre_exec(); however, not all the
facilities are available, and those that are do not always work in quite the
same way. See the user documentation for details.
The algorithm that is used for pcre_dfa_exec() is not a traditional FSM,
because it may have a number of states active at one time. More work would be
needed at compile time to produce a traditional FSM where only one state is
ever active at once. I believe some other regex matchers work this way.
Format of compiled patterns
---------------------------
@@ -79,10 +110,12 @@ item is either implicit in the opcode or contained in the data bytes that
follow it.
In many cases below "two-byte" data values are specified. This is in fact just
a default. PCRE can be compiled to use 3-byte or 4-byte values (impairing the
a default when the number is an offset within the compiled pattern. PCRE can be
compiled to use 3-byte or 4-byte values for these offsets (impairing the
performance). This is necessary only when patterns whose compiled length is
greater than 64K are going to be processed. In this description, we assume the
"normal" compilation options.
greater than 64K are going to be processed. In this description, we assume the
"normal" compilation options. "Two-byte" data values that are counts (e.g. for
quantifiers) are always just two bytes.
A list of all the opcodes follows:
@@ -109,6 +142,7 @@ These items are all just one byte long
OP_EOD match end of data: \z
OP_DOLL $ (end of data, or before \n in multiline)
OP_EXTUNI match an extended Unicode character
OP_ANYNL match any Unicode newline sequence
Repeating single characters
@@ -119,23 +153,28 @@ following opcodes:
OP_STAR
OP_MINSTAR
OP_POSSTAR
OP_PLUS
OP_MINPLUS
OP_POSPLUS
OP_QUERY
OP_MINQUERY
OP_POSQUERY
In ASCII mode, these are two-byte items; in UTF-8 mode, the length is variable.
Those with "MIN" in their name are the minimizing versions. Each is followed by
the character that is to be repeated. Other repeats make use of
Those with "MIN" in their name are the minimizing versions. Those with "POS" in
their names are possessive versions. Each is followed by the character that is
to be repeated. Other repeats make use of
OP_UPTO
OP_MINUPTO
OP_POSUPTO
OP_EXACT
which are followed by a two-byte count (most significant first) and the
repeated character. OP_UPTO matches from 0 to the given number. A repeat with a
non-zero minimum and a fixed maximum is coded as an OP_EXACT followed by an
OP_UPTO (or OP_MINUPTO).
OP_UPTO (or OP_MINUPTO or OP_POSUPTO).
Repeating character types
@@ -147,12 +186,16 @@ byte. The opcodes are:
OP_TYPESTAR
OP_TYPEMINSTAR
OP_TYPEPOSSTAR
OP_TYPEPLUS
OP_TYPEMINPLUS
OP_TYPEPOSPLUS
OP_TYPEQUERY
OP_TYPEMINQUERY
OP_TYPEPOSQUERY
OP_TYPEUPTO
OP_TYPEMINUPTO
OP_TYPEPOSUPTO
OP_TYPEEXACT
@@ -216,9 +259,10 @@ OP_REF is followed by two bytes containing the reference number.
Repeating character classes and back references
-----------------------------------------------
Single-character classes are handled specially (see above). This applies to
OP_CLASS and OP_REF. In both cases, the repeat information follows the base
item. The matching code looks at the following opcode to see if it is one of
Single-character classes are handled specially (see above). This section
applies to OP_CLASS and OP_REF. In both cases, the repeat information follows
the base item. The matching code looks at the following opcode to see if it is
one of
OP_CRSTAR
OP_CRMINSTAR
@@ -230,7 +274,9 @@ item. The matching code looks at the following opcode to see if it is one of
OP_CRMINRANGE
All but the last two are just single-byte items. The others are followed by
four bytes of data, comprising the minimum and maximum repeat counts.
four bytes of data, comprising the minimum and maximum repeat counts. There are
no special possessive opcodes for these repeats; a possessive repeat is
compiled into an atomic group.
Brackets and alternation
@@ -239,29 +285,25 @@ Brackets and alternation
A pair of non-capturing (round) brackets is wrapped round each expression at
compile time, so alternation always happens in the context of brackets.
Non-capturing brackets use the opcode OP_BRA, while capturing brackets use
OP_BRA+1, OP_BRA+2, etc. [Note for North Americans: "bracket" to some English
speakers, including myself, can be round, square, curly, or pointy. Hence this
usage.]
[Note for North Americans: "bracket" to some English speakers, including
myself, can be round, square, curly, or pointy. Hence this usage.]
Originally PCRE was limited to 99 capturing brackets (so as not to use up all
the opcodes). From release 3.5, there is no limit. What happens is that the
first ones, up to EXTRACT_BASIC_MAX are handled with separate opcodes, as
above. If there are more, the opcode is set to EXTRACT_BASIC_MAX+1, and the
first operation in the bracket is OP_BRANUMBER, followed by a 2-byte bracket
number. This opcode is ignored while matching, but is fished out when handling
the bracket itself. (They could have all been done like this, but I was making
minimal changes.)
Non-capturing brackets use the opcode OP_BRA. Originally PCRE was limited to 99
capturing brackets and it used a different opcode for each one. From release
3.5, the limit was removed by putting the bracket number into the data for
higher-numbered brackets. From release 7.0 all capturing brackets are handled
this way, using the single opcode OP_CBRA.
A bracket opcode is followed by LINK_SIZE bytes which give the offset to the
next alternative OP_ALT or, if there aren't any branches, to the matching
OP_KET opcode. Each OP_ALT is followed by LINK_SIZE bytes giving the offset to
the next one, or to the OP_KET opcode.
the next one, or to the OP_KET opcode. For capturing brackets, the bracket
number immediately follows the offset, always as a 2-byte item.
OP_KET is used for subpatterns that do not repeat indefinitely, while
OP_KETRMIN and OP_KETRMAX are used for indefinite repetitions, minimally or
maximally respectively. All three are followed by LINK_SIZE bytes giving (as a
positive number) the offset back to the matching OP_BRA opcode.
positive number) the offset back to the matching bracket opcode.
If a subpattern is quantified such that it is permitted to match zero times, it
is preceded by one of OP_BRAZERO or OP_BRAMINZERO. These are single-byte
@@ -276,7 +318,14 @@ as appropriate.
A subpattern with a bounded maximum repetition is replicated in a nested
fashion up to the maximum number of times, with OP_BRAZERO or OP_BRAMINZERO
before each replication after the minimum, so that, for example, (abc){2,5} is
compiled as (abc)(abc)((abc)((abc)(abc)?)?)?.
compiled as (abc)(abc)((abc)((abc)(abc)?)?)?, except that each bracketed group
has the same number.
When a repeated subpattern has an unbounded upper limit, it is checked to see
whether it could match an empty string. If this is the case, the opcode in the
final replication is changed to OP_SBRA or OP_SCBRA. This tells the matcher
that it needs to check for matching an empty string when it hits OP_KETRMIN or
OP_KETRMAX, and if so, to break the loop.
Assertions
@@ -292,22 +341,27 @@ each alternative of a lookbehind assertion, allowing them to have different
fixed lengths.
Once-only subpatterns
---------------------
Once-only (atomic) subpatterns
------------------------------
These are also just like other subpatterns, but they start with the opcode
OP_ONCE.
OP_ONCE. The check for matching an empty string in an unbounded repeat is
handled entirely at runtime, so there is just this one opcode.
Conditional subpatterns
-----------------------
These are like other subpatterns, but they start with the opcode OP_COND. If
These are like other subpatterns, but they start with the opcode OP_COND, or
OP_SCOND for one that might match an empty string in an unbounded repeat. If
the condition is a back reference, this is stored at the start of the
subpattern using the opcode OP_CREF followed by two bytes containing the
reference number. If the condition is "in recursion" (coded as "(?(R)"), the
same scheme is used, with a "reference number" of 0xffff. Otherwise, a
conditional subpattern always starts with one of the assertions.
reference number. If the condition is "in recursion" (coded as "(?(R)"), or "in
recursion of group x" (coded as "(?(Rx)"), the group number is stored at the
start of the subpattern using the opcode OP_RREF, with a value of zero meaning "the
whole pattern". For a DEFINE condition, just the single byte OP_DEF is used (it
has no associated data). Otherwise, a conditional subpattern always starts with
one of the assertions.
Recursion
@@ -345,4 +399,4 @@ at compile time, and so does not cause anything to be put into the compiled
data.
Philip Hazel
June 2006
November 2006

File diff suppressed because it is too large Load Diff

View File

@@ -5,7 +5,7 @@
/* This is the public header file for the PCRE library, to be #included by
applications that call the PCRE functions.
Copyright (c) 1997-2005 University of Cambridge
Copyright (c) 1997-2006 University of Cambridge
-----------------------------------------------------------------------------
Redistribution and use in source and binary forms, with or without
@@ -38,7 +38,7 @@ POSSIBILITY OF SUCH DAMAGE.
#ifndef _PCRE_H
#define _PCRE_H
#include "php_compat.h"
/* The current PCRE version information. */
@@ -54,10 +54,10 @@ and libpcre.pc. The values are not put into configure.ac and substituted here
cannot run ./configure. As it now stands, this file need not be edited in that
circumstance. */
#define PCRE_MAJOR 6
#define PCRE_MINOR 7
#define PCRE_MAJOR 7
#define PCRE_MINOR 0
#define PCRE_PRERELEASE
#define PCRE_DATE 04-Jul-2006
#define PCRE_DATE 18-Dec-2006
/* Win32 uses DLL by default; it needs special stuff for exported functions
when building PCRE. */
@@ -120,6 +120,7 @@ extern "C" {
#define PCRE_NEWLINE_CR 0x00100000
#define PCRE_NEWLINE_LF 0x00200000
#define PCRE_NEWLINE_CRLF 0x00300000
#define PCRE_NEWLINE_ANY 0x00400000
/* Exec-time and get/set-time error codes */
@@ -127,7 +128,8 @@ extern "C" {
#define PCRE_ERROR_NULL (-2)
#define PCRE_ERROR_BADOPTION (-3)
#define PCRE_ERROR_BADMAGIC (-4)
#define PCRE_ERROR_UNKNOWN_NODE (-5)
#define PCRE_ERROR_UNKNOWN_OPCODE (-5)
#define PCRE_ERROR_UNKNOWN_NODE (-5) /* For backward compatibility */
#define PCRE_ERROR_NOMEMORY (-6)
#define PCRE_ERROR_NOSUBSTRING (-7)
#define PCRE_ERROR_MATCHLIMIT (-8)
@@ -144,6 +146,8 @@ extern "C" {
#define PCRE_ERROR_DFA_WSSIZE (-19)
#define PCRE_ERROR_DFA_RECURSE (-20)
#define PCRE_ERROR_RECURSIONLIMIT (-21)
#define PCRE_ERROR_NULLWSLIMIT (-22)
#define PCRE_ERROR_BADNEWLINE (-23)
/* Request types for pcre_fullinfo() */

File diff suppressed because it is too large Load Diff

View File

@@ -6,7 +6,7 @@
and semantics are as close as possible to those of the Perl 5 language.
Written by Philip Hazel
Copyright (c) 1997-2007 University of Cambridge
Copyright (c) 1997-2006 University of Cambridge
-----------------------------------------------------------------------------
Redistribution and use in source and binary forms, with or without

File diff suppressed because it is too large Load Diff

View File

@@ -6,7 +6,7 @@
and semantics are as close as possible to those of the Perl 5 language.
Written by Philip Hazel
Copyright (c) 1997-2007 University of Cambridge
Copyright (c) 1997-2006 University of Cambridge
-----------------------------------------------------------------------------
Redistribution and use in source and binary forms, with or without

View File

@@ -6,7 +6,7 @@
and semantics are as close as possible to those of the Perl 5 language.
Written by Philip Hazel
Copyright (c) 1997-2007 University of Cambridge
Copyright (c) 1997-2006 University of Cambridge
-----------------------------------------------------------------------------
Redistribution and use in source and binary forms, with or without
@@ -436,7 +436,6 @@ pcre_get_named_substring(const pcre *code, const char *subject, int *ovector,
int n = get_first_set(code, stringname, ovector);
if (n <= 0) return n;
return pcre_get_substring(subject, ovector, stringcount, n, stringptr);
}

View File

@@ -6,7 +6,7 @@
and semantics are as close as possible to those of the Perl 5 language.
Written by Philip Hazel
Copyright (c) 1997-2007 University of Cambridge
Copyright (c) 1997-2006 University of Cambridge
-----------------------------------------------------------------------------
Redistribution and use in source and binary forms, with or without
@@ -51,6 +51,18 @@ differently, and global variables are not used (see pcre.in). */
#ifndef VPCOMPAT
/**************************************************************************
This code used to be here for use when compiling as a C++ library. However,
according to Dair Grant it is not needed: "
Including 'extern "C"' in the declaration generates an "initialized and
declared `extern'" warning from gcc 4.0.1. Since we include pcre_internal.h,
which includes pcre.h, which declares these prototypes within an extern "C" {}
block, we shouldn't need the prefix here.
So, from Release 7.0 I have cut this out.
#ifdef __cplusplus
extern "C" void *(*pcre_malloc)(size_t) = malloc;
extern "C" void (*pcre_free)(void *) = free;
@@ -58,12 +70,13 @@ extern "C" void *(*pcre_stack_malloc)(size_t) = malloc;
extern "C" void (*pcre_stack_free)(void *) = free;
extern "C" int (*pcre_callout)(pcre_callout_block *) = NULL;
#else
**************************************************************************/
void *(*pcre_malloc)(size_t) = malloc;
void (*pcre_free)(void *) = free;
void *(*pcre_stack_malloc)(size_t) = malloc;
void (*pcre_stack_free)(void *) = free;
int (*pcre_callout)(pcre_callout_block *) = NULL;
#endif
#endif
/* End of pcre_globals.c */

View File

@@ -6,7 +6,7 @@
and semantics are as close as possible to those of the Perl 5 language.
Written by Philip Hazel
Copyright (c) 1997-2007 University of Cambridge
Copyright (c) 1997-2006 University of Cambridge
-----------------------------------------------------------------------------
Redistribution and use in source and binary forms, with or without

View File

@@ -7,7 +7,7 @@
and semantics are as close as possible to those of the Perl 5 language.
Written by Philip Hazel
Copyright (c) 1997-2007 University of Cambridge
Copyright (c) 1997-2006 University of Cambridge
-----------------------------------------------------------------------------
Redistribution and use in source and binary forms, with or without
@@ -54,12 +54,16 @@ functions whose names all begin with "_pcre_". */
/* Use a macro for debugging printing, 'cause that eliminates the use of #ifdef
inline, and there are *still* stupid compilers about that don't like indented
pre-processor statements, or at least there were when I first wrote this. After
all, it had only been about 10 years then... */
all, it had only been about 10 years then...
It turns out that the Mac Debugging.h header also defines the macro DPRINTF, so
be absolutely sure we get our version. */
#undef DPRINTF
#ifdef DEBUG
#define DPRINTF(p) printf p
#else
#define DPRINTF(p) /*nothing*/
#define DPRINTF(p) /* Nothing */
#endif
@@ -118,13 +122,48 @@ Unix, where it is defined in sys/types, so use "uschar" instead. */
typedef unsigned char uschar;
/* PCRE is able to support 3 different kinds of newline (CR, LF, CRLF). The
following macro is used to package up testing for newlines. NLBLOCK is defined
in the various modules to indicate in which datablock the parameters exist. */
/* This is an unsigned int value that no character can ever have. UTF-8
characters only go up to 0x7fffffff (though Unicode doesn't go beyond
0x0010ffff). */
#define NOTACHAR 0xffffffff
/* PCRE is able to support several different kinds of newline (CR, LF, CRLF,
and "all" at present). The following macros are used to package up testing for
newlines. NLBLOCK, PSSTART, and PSEND are defined in the various modules to
indicate in which datablock the parameters exist, and what the start/end of
string field names are. */
#define NLTYPE_FIXED 0 /* Newline is a fixed length string */
#define NLTYPE_ANY 1 /* Newline is any Unicode line ending */
/* This macro checks for a newline at the given position */
#define IS_NEWLINE(p) \
((p)[0] == NLBLOCK->nl[0] && \
(NLBLOCK->nllen == 1 || (p)[1] == NLBLOCK->nl[1]))
((NLBLOCK->nltype != NLTYPE_FIXED)? \
((p) < NLBLOCK->PSEND && \
_pcre_is_newline((p), NLBLOCK->PSEND, &(NLBLOCK->nllen), utf8) \
) \
: \
((p) <= NLBLOCK->PSEND - NLBLOCK->nllen && \
(p)[0] == NLBLOCK->nl[0] && \
(NLBLOCK->nllen == 1 || (p)[1] == NLBLOCK->nl[1]) \
) \
)
/* This macro checks for a newline immediately preceding the given position.
It expands to an expression that relies on NLBLOCK, PSSTART and a variable
named utf8 being available wherever the macro is used. When NLBLOCK->nltype is
not NLTYPE_FIXED, it requires p to be beyond NLBLOCK->PSSTART and hands the
test to _pcre_was_newline(), passing the address of NLBLOCK->nllen. Otherwise
it requires at least nllen bytes between PSSTART and p, and compares the one or
two bytes that end at p with NLBLOCK->nl[0] (and nl[1] when nllen is not 1). */
#define WAS_NEWLINE(p) \
((NLBLOCK->nltype != NLTYPE_FIXED)? \
((p) > NLBLOCK->PSSTART && \
_pcre_was_newline((p), NLBLOCK->PSSTART, &(NLBLOCK->nllen), utf8) \
) \
: \
((p) >= NLBLOCK->PSSTART + NLBLOCK->nllen && \
(p)[-NLBLOCK->nllen] == NLBLOCK->nl[0] && \
(NLBLOCK->nllen == 1 || (p)[-NLBLOCK->nllen+1] == NLBLOCK->nl[1]) \
) \
)
/* When PCRE is compiled as a C++ library, the subject pointer can be replaced
with a custom type. This makes it possible, for example, to allow pcre_exec()
@@ -282,7 +321,7 @@ we know we are in UTF-8 mode. */
#define GETCHAR(c, eptr) \
c = *eptr; \
if ((c & 0xc0) == 0xc0) \
if (c >= 0xc0) \
{ \
int gcii; \
int gcaa = _pcre_utf8_table4[c & 0x3f]; /* Number of additional bytes */ \
@@ -300,7 +339,7 @@ pointer. */
#define GETCHARTEST(c, eptr) \
c = *eptr; \
if (utf8 && (c & 0xc0) == 0xc0) \
if (utf8 && c >= 0xc0) \
{ \
int gcii; \
int gcaa = _pcre_utf8_table4[c & 0x3f]; /* Number of additional bytes */ \
@@ -318,7 +357,7 @@ know we are in UTF-8 mode. */
#define GETCHARINC(c, eptr) \
c = *eptr++; \
if ((c & 0xc0) == 0xc0) \
if (c >= 0xc0) \
{ \
int gcaa = _pcre_utf8_table4[c & 0x3f]; /* Number of additional bytes */ \
int gcss = 6*gcaa; \
@@ -334,7 +373,7 @@ know we are in UTF-8 mode. */
#define GETCHARINCTEST(c, eptr) \
c = *eptr++; \
if (utf8 && (c & 0xc0) == 0xc0) \
if (utf8 && c >= 0xc0) \
{ \
int gcaa = _pcre_utf8_table4[c & 0x3f]; /* Number of additional bytes */ \
int gcss = 6*gcaa; \
@@ -351,7 +390,7 @@ if there are extra bytes. This is called when we know we are in UTF-8 mode. */
#define GETCHARLEN(c, eptr, len) \
c = *eptr; \
if ((c & 0xc0) == 0xc0) \
if (c >= 0xc0) \
{ \
int gcii; \
int gcaa = _pcre_utf8_table4[c & 0x3f]; /* Number of additional bytes */ \
@@ -404,20 +443,21 @@ bits. */
/* Masks for identifying the public options that are permitted at compile
time, run time, or study time, respectively. */
#define PCRE_NEWLINE_BITS (PCRE_NEWLINE_CR|PCRE_NEWLINE_LF|PCRE_NEWLINE_ANY)
#define PUBLIC_OPTIONS \
(PCRE_CASELESS|PCRE_EXTENDED|PCRE_ANCHORED|PCRE_MULTILINE| \
PCRE_DOTALL|PCRE_DOLLAR_ENDONLY|PCRE_EXTRA|PCRE_UNGREEDY|PCRE_UTF8| \
PCRE_NO_AUTO_CAPTURE|PCRE_NO_UTF8_CHECK|PCRE_AUTO_CALLOUT|PCRE_FIRSTLINE| \
PCRE_DUPNAMES|PCRE_NEWLINE_CR|PCRE_NEWLINE_LF)
PCRE_DUPNAMES|PCRE_NEWLINE_BITS)
#define PUBLIC_EXEC_OPTIONS \
(PCRE_ANCHORED|PCRE_NOTBOL|PCRE_NOTEOL|PCRE_NOTEMPTY|PCRE_NO_UTF8_CHECK| \
PCRE_PARTIAL|PCRE_NEWLINE_CR|PCRE_NEWLINE_LF)
PCRE_PARTIAL|PCRE_NEWLINE_BITS)
#define PUBLIC_DFA_EXEC_OPTIONS \
(PCRE_ANCHORED|PCRE_NOTBOL|PCRE_NOTEOL|PCRE_NOTEMPTY|PCRE_NO_UTF8_CHECK| \
PCRE_PARTIAL|PCRE_DFA_SHORTEST|PCRE_DFA_RESTART|PCRE_NEWLINE_CR| \
PCRE_NEWLINE_LF)
PCRE_PARTIAL|PCRE_DFA_SHORTEST|PCRE_DFA_RESTART|PCRE_NEWLINE_BITS)
#define PUBLIC_STUDY_OPTIONS 0 /* None defined */
@@ -449,9 +489,7 @@ typedef int BOOL;
#define FALSE 0
#define TRUE 1
/* Escape items that are just an encoding of a particular data value. Note that
ESC_n is defined as yet another macro, which is set in config.h to either \n
(the default) or \r (which some people want). */
/* Escape items that are just an encoding of a particular data value. */
#ifndef ESC_e
#define ESC_e 27
@@ -462,7 +500,7 @@ ESC_n is defined as yet another macro, which is set in config.h to either \n
#endif
#ifndef ESC_n
#define ESC_n NEWLINE
#define ESC_n '\n'
#endif
#ifndef ESC_r
@@ -501,21 +539,28 @@ value such as \n. They must have non-zero values, as check_escape() returns
their negation. Also, they must appear in the same order as in the opcode
definitions below, up to ESC_z. There's a dummy for OP_ANY because it
corresponds to "." rather than an escape sequence. The final one must be
ESC_REF as subsequent values are used for \1, \2, \3, etc. There is are two
tests in the code for an escape greater than ESC_b and less than ESC_Z to
detect the types that may be repeated. These are the types that consume
characters. If any new escapes are put in between that don't consume a
ESC_REF as subsequent values are used for backreferences (\1, \2, \3, etc).
There are two tests in the code for an escape greater than ESC_b and less than
ESC_Z to detect the types that may be repeated. These are the types that
consume characters. If any new escapes are put in between that don't consume a
character, that code will have to change. */
enum { ESC_A = 1, ESC_G, ESC_B, ESC_b, ESC_D, ESC_d, ESC_S, ESC_s, ESC_W,
ESC_w, ESC_dum1, ESC_C, ESC_P, ESC_p, ESC_X, ESC_Z, ESC_z, ESC_E,
ESC_Q, ESC_REF };
ESC_w, ESC_dum1, ESC_C, ESC_P, ESC_p, ESC_R, ESC_X, ESC_Z, ESC_z,
ESC_E, ESC_Q, ESC_k, ESC_REF };
/* Opcode table: OP_BRA must be last, as all values >= it are used for brackets
that extract substrings. Starting from 1 (i.e. after OP_END), the values up to
OP_EOD must correspond in order to the list of escapes immediately above.
Note that whenever this list is updated, the two macro definitions that follow
must also be updated to match. */
To keep stored, compiled patterns compatible, new opcodes should be added
immediately before OP_BRA, where (since release 7.0) a gap is left for this
purpose.
*** NOTE NOTE NOTE *** Whenever this list is updated, the two macro definitions
that follow must also be updated to match. There is also a table called
"coptable" in pcre_dfa_exec.c that must be updated. */
enum {
OP_END, /* 0 End of pattern */
@@ -536,111 +581,123 @@ enum {
OP_ANYBYTE, /* 12 Match any byte (\C); different to OP_ANY for UTF-8 */
OP_NOTPROP, /* 13 \P (not Unicode property) */
OP_PROP, /* 14 \p (Unicode property) */
OP_EXTUNI, /* 15 \X (extended Unicode sequence */
OP_EODN, /* 16 End of data or \n at end of data: \Z. */
OP_EOD, /* 17 End of data: \z */
OP_ANYNL, /* 15 \R (any newline sequence) */
OP_EXTUNI, /* 16 \X (extended Unicode sequence) */
OP_EODN, /* 17 End of data or \n at end of data: \Z. */
OP_EOD, /* 18 End of data: \z */
OP_OPT, /* 18 Set runtime options */
OP_CIRC, /* 19 Start of line - varies with multiline switch */
OP_DOLL, /* 20 End of line - varies with multiline switch */
OP_CHAR, /* 21 Match one character, casefully */
OP_CHARNC, /* 22 Match one character, caselessly */
OP_NOT, /* 23 Match one character, not the following one */
OP_OPT, /* 19 Set runtime options */
OP_CIRC, /* 20 Start of line - varies with multiline switch */
OP_DOLL, /* 21 End of line - varies with multiline switch */
OP_CHAR, /* 22 Match one character, casefully */
OP_CHARNC, /* 23 Match one character, caselessly */
OP_NOT, /* 24 Match one character, not the following one */
OP_STAR, /* 24 The maximizing and minimizing versions of */
OP_MINSTAR, /* 25 all these opcodes must come in pairs, with */
OP_PLUS, /* 26 the minimizing one second. */
OP_MINPLUS, /* 27 This first set applies to single characters */
OP_QUERY, /* 28 */
OP_MINQUERY, /* 29 */
OP_UPTO, /* 30 From 0 to n matches */
OP_MINUPTO, /* 31 */
OP_EXACT, /* 32 Exactly n matches */
OP_STAR, /* 25 The maximizing and minimizing versions of */
OP_MINSTAR, /* 26 these six opcodes must come in pairs, with */
OP_PLUS, /* 27 the minimizing one second. */
OP_MINPLUS, /* 28 This first set applies to single characters.*/
OP_QUERY, /* 29 */
OP_MINQUERY, /* 30 */
OP_NOTSTAR, /* 33 The maximizing and minimizing versions of */
OP_NOTMINSTAR, /* 34 all these opcodes must come in pairs, with */
OP_NOTPLUS, /* 35 the minimizing one second. */
OP_NOTMINPLUS, /* 36 This set applies to "not" single characters */
OP_NOTQUERY, /* 37 */
OP_NOTMINQUERY, /* 38 */
OP_NOTUPTO, /* 39 From 0 to n matches */
OP_NOTMINUPTO, /* 40 */
OP_NOTEXACT, /* 41 Exactly n matches */
OP_UPTO, /* 31 From 0 to n matches */
OP_MINUPTO, /* 32 */
OP_EXACT, /* 33 Exactly n matches */
OP_TYPESTAR, /* 42 The maximizing and minimizing versions of */
OP_TYPEMINSTAR, /* 43 all these opcodes must come in pairs, with */
OP_TYPEPLUS, /* 44 the minimizing one second. These codes must */
OP_TYPEMINPLUS, /* 45 be in exactly the same order as those above. */
OP_TYPEQUERY, /* 46 This set applies to character types such as \d */
OP_TYPEMINQUERY, /* 47 */
OP_TYPEUPTO, /* 48 From 0 to n matches */
OP_TYPEMINUPTO, /* 49 */
OP_TYPEEXACT, /* 50 Exactly n matches */
OP_POSSTAR, /* 34 Possessified star */
OP_POSPLUS, /* 35 Possessified plus */
OP_POSQUERY, /* 36 Possessified query */
OP_POSUPTO, /* 37 Possessified upto */
OP_CRSTAR, /* 51 The maximizing and minimizing versions of */
OP_CRMINSTAR, /* 52 all these opcodes must come in pairs, with */
OP_CRPLUS, /* 53 the minimizing one second. These codes must */
OP_CRMINPLUS, /* 54 be in exactly the same order as those above. */
OP_CRQUERY, /* 55 These are for character classes and back refs */
OP_CRMINQUERY, /* 56 */
OP_CRRANGE, /* 57 These are different to the three sets above. */
OP_CRMINRANGE, /* 58 */
OP_NOTSTAR, /* 38 The maximizing and minimizing versions of */
OP_NOTMINSTAR, /* 39 these six opcodes must come in pairs, with */
OP_NOTPLUS, /* 40 the minimizing one second. They must be in */
OP_NOTMINPLUS, /* 41 exactly the same order as those above. */
OP_NOTQUERY, /* 42 This set applies to "not" single characters. */
OP_NOTMINQUERY, /* 43 */
OP_CLASS, /* 59 Match a character class, chars < 256 only */
OP_NCLASS, /* 60 Same, but the bitmap was created from a negative
OP_NOTUPTO, /* 44 From 0 to n matches */
OP_NOTMINUPTO, /* 45 */
OP_NOTEXACT, /* 46 Exactly n matches */
OP_NOTPOSSTAR, /* 47 Possessified versions */
OP_NOTPOSPLUS, /* 48 */
OP_NOTPOSQUERY, /* 49 */
OP_NOTPOSUPTO, /* 50 */
OP_TYPESTAR, /* 51 The maximizing and minimizing versions of */
OP_TYPEMINSTAR, /* 52 these six opcodes must come in pairs, with */
OP_TYPEPLUS, /* 53 the minimizing one second. These codes must */
OP_TYPEMINPLUS, /* 54 be in exactly the same order as those above. */
OP_TYPEQUERY, /* 55 This set applies to character types such as \d */
OP_TYPEMINQUERY, /* 56 */
OP_TYPEUPTO, /* 57 From 0 to n matches */
OP_TYPEMINUPTO, /* 58 */
OP_TYPEEXACT, /* 59 Exactly n matches */
OP_TYPEPOSSTAR, /* 60 Possessified versions */
OP_TYPEPOSPLUS, /* 61 */
OP_TYPEPOSQUERY, /* 62 */
OP_TYPEPOSUPTO, /* 63 */
OP_CRSTAR, /* 64 The maximizing and minimizing versions of */
OP_CRMINSTAR, /* 65 all these opcodes must come in pairs, with */
OP_CRPLUS, /* 66 the minimizing one second. These codes must */
OP_CRMINPLUS, /* 67 be in exactly the same order as those above. */
OP_CRQUERY, /* 68 These are for character classes and back refs */
OP_CRMINQUERY, /* 69 */
OP_CRRANGE, /* 70 These are different to the three sets above. */
OP_CRMINRANGE, /* 71 */
OP_CLASS, /* 72 Match a character class, chars < 256 only */
OP_NCLASS, /* 73 Same, but the bitmap was created from a negative
class - the difference is relevant only when a UTF-8
character > 255 is encountered. */
OP_XCLASS, /* 61 Extended class for handling UTF-8 chars within the
OP_XCLASS, /* 74 Extended class for handling UTF-8 chars within the
class. This does both positive and negative. */
OP_REF, /* 62 Match a back reference */
OP_RECURSE, /* 63 Match a numbered subpattern (possibly recursive) */
OP_CALLOUT, /* 64 Call out to external function if provided */
OP_REF, /* 75 Match a back reference */
OP_RECURSE, /* 76 Match a numbered subpattern (possibly recursive) */
OP_CALLOUT, /* 77 Call out to external function if provided */
OP_ALT, /* 65 Start of alternation */
OP_KET, /* 66 End of group that doesn't have an unbounded repeat */
OP_KETRMAX, /* 67 These two must remain together and in this */
OP_KETRMIN, /* 68 order. They are for groups the repeat for ever. */
OP_ALT, /* 78 Start of alternation */
OP_KET, /* 79 End of group that doesn't have an unbounded repeat */
OP_KETRMAX, /* 80 These two must remain together and in this */
OP_KETRMIN, /* 81 order. They are for groups that repeat for ever. */
/* The assertions must come before ONCE and COND */
/* The assertions must come before BRA, CBRA, ONCE, and COND.*/
OP_ASSERT, /* 69 Positive lookahead */
OP_ASSERT_NOT, /* 70 Negative lookahead */
OP_ASSERTBACK, /* 71 Positive lookbehind */
OP_ASSERTBACK_NOT, /* 72 Negative lookbehind */
OP_REVERSE, /* 73 Move pointer back - used in lookbehind assertions */
OP_ASSERT, /* 82 Positive lookahead */
OP_ASSERT_NOT, /* 83 Negative lookahead */
OP_ASSERTBACK, /* 84 Positive lookbehind */
OP_ASSERTBACK_NOT, /* 85 Negative lookbehind */
OP_REVERSE, /* 86 Move pointer back - used in lookbehind assertions */
/* ONCE and COND must come after the assertions, with ONCE first, as there's
a test for >= ONCE for a subpattern that isn't an assertion. */
/* ONCE, BRA, CBRA, and COND must come after the assertions, with ONCE first,
as there's a test for >= ONCE for a subpattern that isn't an assertion. */
OP_ONCE, /* 74 Once matched, don't back up into the subpattern */
OP_COND, /* 75 Conditional group */
OP_CREF, /* 76 Used to hold an extraction string number (cond ref) */
OP_ONCE, /* 87 Atomic group */
OP_BRA, /* 88 Start of non-capturing bracket */
OP_CBRA, /* 89 Start of capturing bracket */
OP_COND, /* 90 Conditional group */
OP_BRAZERO, /* 77 These two must remain together and in this */
OP_BRAMINZERO, /* 78 order. */
/* These three must follow the previous three, in the same order. There's a
check for >= SBRA to distinguish the two sets. */
OP_BRANUMBER, /* 79 Used for extracting brackets whose number is greater
than can fit into an opcode. */
OP_SBRA, /* 91 Start of non-capturing bracket, check empty */
OP_SCBRA, /* 92 Start of capturing bracket, check empty */
OP_SCOND, /* 93 Conditional group, check empty */
OP_BRA /* 80 This and greater values are used for brackets that
extract substrings up to EXTRACT_BASIC_MAX. After
that, use is made of OP_BRANUMBER. */
OP_CREF, /* 94 Used to hold a capture number as condition */
OP_RREF, /* 95 Used to hold a recursion number as condition */
OP_DEF, /* 96 The DEFINE condition */
OP_BRAZERO, /* 97 These two must remain together and in this */
OP_BRAMINZERO /* 98 order. */
};
/* WARNING WARNING WARNING: There is an implicit assumption in pcre.c and
study.c that all opcodes are less than 128 in value. This makes handling UTF-8
character sequences easier. */
/* The highest extraction number before we have to start using additional
bytes. (Originally PCRE didn't have support for extraction counts highter than
this number.) The value is limited by the number of opcodes left after OP_BRA,
i.e. 255 - OP_BRA. We actually set it a bit lower to leave room for additional
opcodes. */
#define EXTRACT_BASIC_MAX 100
/* This macro defines textual names for all the opcodes. These are used only
for debugging. The macro is referenced only in pcre_printint.c. */
@@ -648,17 +705,21 @@ for debugging. The macro is referenced only in pcre_printint.c. */
#define OP_NAME_LIST \
"End", "\\A", "\\G", "\\B", "\\b", "\\D", "\\d", \
"\\S", "\\s", "\\W", "\\w", "Any", "Anybyte", \
"notprop", "prop", "extuni", \
"notprop", "prop", "anynl", "extuni", \
"\\Z", "\\z", \
"Opt", "^", "$", "char", "charnc", "not", \
"*", "*?", "+", "+?", "?", "??", "{", "{", "{", \
"*+","++", "?+", "{", \
"*", "*?", "+", "+?", "?", "??", "{", "{", "{", \
"*+","++", "?+", "{", \
"*", "*?", "+", "+?", "?", "??", "{", "{", "{", \
"*+","++", "?+", "{", \
"*", "*?", "+", "+?", "?", "??", "{", "{", \
"class", "nclass", "xclass", "Ref", "Recurse", "Callout", \
"Alt", "Ket", "KetRmax", "KetRmin", "Assert", "Assert not", \
"AssertB", "AssertB not", "Reverse", "Once", "Cond", "Cond ref",\
"Brazero", "Braminzero", "Branumber", "Bra"
"AssertB", "AssertB not", "Reverse", \
"Once", "Bra 0", "Bra", "Cond", "SBra 0", "SBra", "SCond", \
"Cond ref", "Cond rec", "Cond def", "Brazero", "Braminzero"
/* This macro defines the length of fixed length operations in the compiled
@@ -674,7 +735,7 @@ in UTF-8 mode. The code that uses this table must know about such things. */
1, /* End */ \
1, 1, 1, 1, 1, 1, 1, 1, 1, 1, /* \A, \G, \B, \b, \D, \d, \S, \s, \W, \w */ \
1, 1, /* Any, Anybyte */ \
3, 3, 1, /* NOTPROP, PROP, EXTUNI */ \
3, 3, 1, 1, /* NOTPROP, PROP, ANYNL, EXTUNI */ \
1, 1, 2, 1, 1, /* \Z, \z, Opt, ^, $ */ \
2, /* Char - the minimum length */ \
2, /* Charnc - the minimum length */ \
@@ -682,12 +743,15 @@ in UTF-8 mode. The code that uses this table must know about such things. */
/* Positive single-char repeats ** These are */ \
2, 2, 2, 2, 2, 2, /* *, *?, +, +?, ?, ?? ** minima in */ \
4, 4, 4, /* upto, minupto, exact ** UTF-8 mode */ \
2, 2, 2, 4, /* *+, ++, ?+, upto+ */ \
/* Negative single-char repeats - only for chars < 256 */ \
2, 2, 2, 2, 2, 2, /* NOT *, *?, +, +?, ?, ?? */ \
4, 4, 4, /* NOT upto, minupto, exact */ \
2, 2, 2, 4, /* Possessive *, +, ?, upto */ \
/* Positive type repeats */ \
2, 2, 2, 2, 2, 2, /* Type *, *?, +, +?, ?, ?? */ \
4, 4, 4, /* Type upto, minupto, exact */ \
2, 2, 2, 4, /* Possessive *+, ++, ?+, upto+ */ \
/* Character class & ref repeats */ \
1, 1, 1, 1, 1, 1, /* *, *?, +, +?, ?, ?? */ \
5, 5, /* CRRANGE, CRMINRANGE */ \
@@ -706,17 +770,22 @@ in UTF-8 mode. The code that uses this table must know about such things. */
1+LINK_SIZE, /* Assert behind */ \
1+LINK_SIZE, /* Assert behind not */ \
1+LINK_SIZE, /* Reverse */ \
1+LINK_SIZE, /* Once */ \
1+LINK_SIZE, /* ONCE */ \
1+LINK_SIZE, /* BRA */ \
3+LINK_SIZE, /* CBRA */ \
1+LINK_SIZE, /* COND */ \
1+LINK_SIZE, /* SBRA */ \
3+LINK_SIZE, /* SCBRA */ \
1+LINK_SIZE, /* SCOND */ \
3, /* CREF */ \
3, /* RREF */ \
1, /* DEF */ \
1, 1, /* BRAZERO, BRAMINZERO */ \
3, /* BRANUMBER */ \
1+LINK_SIZE /* BRA */ \
/* A magic value for OP_CREF to indicate the "in recursion" condition. */
/* A magic value for OP_RREF to indicate the "any recursion" condition. */
#define CREF_RECURSE 0xffff
#define RREF_ANY 0xffff
/* Error code numbers. They are given names so that they can more easily be
tracked. */
@@ -726,7 +795,7 @@ enum { ERR0, ERR1, ERR2, ERR3, ERR4, ERR5, ERR6, ERR7, ERR8, ERR9,
ERR20, ERR21, ERR22, ERR23, ERR24, ERR25, ERR26, ERR27, ERR28, ERR29,
ERR30, ERR31, ERR32, ERR33, ERR34, ERR35, ERR36, ERR37, ERR38, ERR39,
ERR40, ERR41, ERR42, ERR43, ERR44, ERR45, ERR46, ERR47, ERR48, ERR49,
ERR50, ERR51 };
ERR50, ERR51, ERR52, ERR53, ERR54, ERR55, ERR56, ERR57 };
/* The real format of the start of the pcre block; the index of names and the
code vector run on as long as necessary after the end. We store an explicit
@@ -781,17 +850,23 @@ typedef struct compile_data {
const uschar *fcc; /* Points to case-flipping table */
const uschar *cbits; /* Points to character type table */
const uschar *ctypes; /* Points to table of type maps */
const uschar *start_workspace;/* The start of working space */
const uschar *start_code; /* The start of the compiled code */
const uschar *start_pattern; /* The start of the pattern */
const uschar *end_pattern; /* The end of the pattern */
uschar *hwm; /* High watermark of workspace */
uschar *name_table; /* The name/number table */
int names_found; /* Number of entries so far */
int name_entry_size; /* Size of each entry */
int bracount; /* Count of capturing parens */
int top_backref; /* Maximum back reference */
unsigned int backref_map; /* Bitmap of low back refs */
int external_options; /* External (initial) options */
int req_varyopt; /* "After variable item" flag for reqbyte */
BOOL nopartial; /* Set TRUE if partial won't work */
int nllen; /* 1 or 2 for newline string length */
uschar nl[4]; /* Newline string */
int nltype; /* Newline type */
int nllen; /* Newline string length */
uschar nl[4]; /* Newline string when fixed length */
} compile_data;
/* Structure for maintaining a chain of pointers to the currently incomplete
@@ -824,6 +899,16 @@ This isn't used for a "normal" compilation of pcre. */
struct heapframe;
/* Structure for building a chain of data for holding the values of the subject
pointer at the start of each subpattern, so as to detect when an empty string
has been matched by a subpattern - to break infinite loops. */
typedef struct eptrblock {
struct eptrblock *epb_prev;
USPTR epb_saved_eptr;
} eptrblock;
/* Structure for passing "static" information around between the functions
doing traditional NFA matching, so that they are thread-safe. */
@@ -834,8 +919,9 @@ typedef struct match_data {
int *offset_vector; /* Offset vector */
int offset_end; /* One past the end */
int offset_max; /* The maximum usable for return data */
int nllen; /* 1 or 2 for newline string length */
uschar nl[4]; /* Newline string */
int nltype; /* Newline type */
int nllen; /* Newline string length */
uschar nl[4]; /* Newline string when fixed */
const uschar *lcc; /* Points to lower casing table */
const uschar *ctypes; /* Points to table of type maps */
BOOL offset_overflow; /* Set if too many extractions */
@@ -854,6 +940,8 @@ typedef struct match_data {
int end_offset_top; /* Highwater mark at end of match */
int capture_last; /* Most recent capture number */
int start_offset; /* The start offset value */
eptrblock *eptrchain; /* Chain of eptrblocks for tail recursions */
int eptrn; /* Next free eptrblock */
recursion_info *recursive; /* Linked list of recursion data */
void *callout_data; /* To pass back to callouts */
struct heapframe *thisframe; /* Used only when compiling for no recursion */
@@ -869,8 +957,9 @@ typedef struct dfa_match_data {
const uschar *tables; /* Character tables */
int moptions; /* Match options */
int poptions; /* Pattern options */
int nllen; /* 1 or 2 for newline string length */
uschar nl[4]; /* Newline string */
int nltype; /* Newline type */
int nllen; /* Newline string length */
uschar nl[4]; /* Newline string when fixed */
void *callout_data; /* To pass back to callouts */
} dfa_match_data;
@@ -941,13 +1030,17 @@ extern const uschar _pcre_OP_lengths[];
one of the exported public functions. They have to be "external" in the C
sense, but are not part of the PCRE public API. */
extern int _pcre_ord2utf8(int, uschar *);
extern real_pcre * _pcre_try_flipped(const real_pcre *, real_pcre *,
const pcre_study_data *, pcre_study_data *);
extern int _pcre_ucp_findprop(const unsigned int, int *, int *);
extern int _pcre_ucp_othercase(const int);
extern int _pcre_valid_utf8(const uschar *, int);
extern BOOL _pcre_xclass(int, const uschar *);
extern BOOL _pcre_is_newline(const uschar *, const uschar *, int *,
BOOL);
extern int _pcre_ord2utf8(int, uschar *);
extern real_pcre *_pcre_try_flipped(const real_pcre *, real_pcre *,
const pcre_study_data *, pcre_study_data *);
extern int _pcre_ucp_findprop(const unsigned int, int *, int *);
extern unsigned int _pcre_ucp_othercase(const unsigned int);
extern int _pcre_valid_utf8(const uschar *, int);
extern BOOL _pcre_was_newline(const uschar *, const uschar *, int *,
BOOL);
extern BOOL _pcre_xclass(int, const uschar *);
#endif

View File

@@ -6,7 +6,7 @@
and semantics are as close as possible to those of the Perl 5 language.
Written by Philip Hazel
Copyright (c) 1997-2007 University of Cambridge
Copyright (c) 1997-2006 University of Cambridge
-----------------------------------------------------------------------------
Redistribution and use in source and binary forms, with or without
@@ -130,7 +130,7 @@ for (i = 0; i < 256; i++)
meta-character, which in this sense is any character that terminates a run
of data characters. */
if (strchr("*+?{^.$|()[", i) != 0) x += ctype_meta;
if (strchr("\\*+?{^.$|()[", i) != 0) x += ctype_meta;
*p++ = x;
}

View File

@@ -0,0 +1,135 @@
/*************************************************
* Perl-Compatible Regular Expressions *
*************************************************/
/* PCRE is a library of functions to support regular expressions whose syntax
and semantics are as close as possible to those of the Perl 5 language.
Written by Philip Hazel
Copyright (c) 1997-2006 University of Cambridge
-----------------------------------------------------------------------------
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are met:
* Redistributions of source code must retain the above copyright notice,
this list of conditions and the following disclaimer.
* Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in the
documentation and/or other materials provided with the distribution.
* Neither the name of the University of Cambridge nor the names of its
contributors may be used to endorse or promote products derived from
this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
POSSIBILITY OF SUCH DAMAGE.
-----------------------------------------------------------------------------
*/
/* This module contains internal functions for testing newlines when more than
one kind of newline is to be recognized. When a newline is found, its length is
returned. In principle, we could implement several newline "types", each
referring to a different set of newline characters. At present, PCRE supports
only NLTYPE_FIXED, which gets handled without these functions, and NLTYPE_ALL,
so for now the type isn't passed into the functions. It can easily be added
later if required. The full list of Unicode newline characters is taken from
http://unicode.org/unicode/reports/tr18/. */
#include "pcre_internal.h"
/*************************************************
* Check for newline at given position *
*************************************************/
/* It is guaranteed that the initial value of ptr is less than the end of the
string that is being processed.
Arguments:
ptr pointer to possible newline
endptr pointer to the end of the string
lenptr where to return the length
utf8 TRUE if in utf8 mode
Returns: TRUE or FALSE
*/
BOOL
_pcre_is_newline(const uschar *ptr, const uschar *endptr, int *lenptr,
  BOOL utf8)
{
int ch;

/* Fetch the character at ptr: a decoded UTF-8 character in utf8 mode,
otherwise the single byte that ptr addresses. */

if (utf8)
  {
  GETCHAR(ch, ptr);
  }
else
  {
  ch = *ptr;
  }

/* LF, VT and FF are always one-byte newlines. */

if (ch == 0x000a || ch == 0x000b || ch == 0x000c)
  {
  *lenptr = 1;
  return TRUE;
  }

/* CR is a newline on its own, but when an LF follows it before the end of
the string the pair is reported as a single newline of length 2. */

if (ch == 0x000d)
  {
  if (ptr < endptr - 1 && ptr[1] == 0x0a) *lenptr = 2; else *lenptr = 1;
  return TRUE;
  }

/* NEL is reported with length 2 in utf8 mode and length 1 otherwise. */

if (ch == 0x0085)
  {
  if (utf8) *lenptr = 2; else *lenptr = 1;
  return TRUE;
  }

/* LS and PS are reported with length 3. */

if (ch == 0x2028 || ch == 0x2029)
  {
  *lenptr = 3;
  return TRUE;
  }

/* Anything else is not a newline; *lenptr is left untouched. */

return FALSE;
}
/*************************************************
* Check for newline at previous position *
*************************************************/
/* It is guaranteed that the initial value of ptr is greater than the start of
the string that is being processed.
Arguments:
ptr pointer to possible newline
startptr pointer to the start of the string
lenptr where to return the length
utf8 TRUE if in utf8 mode
Returns: TRUE or FALSE
*/
BOOL
_pcre_was_newline(const uschar *ptr, const uschar *startptr, int *lenptr,
  BOOL utf8)
{
int ch;

/* Step back to the previous character. In utf8 mode BACKCHAR adjusts ptr
further before the character is decoded with GETCHAR. */

ptr--;
if (utf8)
  {
  BACKCHAR(ptr);
  GETCHAR(ch, ptr);
  }
else
  {
  ch = *ptr;
  }

/* LF is a newline on its own, but when a CR immediately precedes it (and we
are not already at the start of the string) the pair has length 2. */

if (ch == 0x000a)
  {
  if (ptr > startptr && ptr[-1] == 0x0d) *lenptr = 2; else *lenptr = 1;
  return TRUE;
  }

/* VT, FF and a CR (seen here as the last character) are one-byte newlines. */

if (ch == 0x000b || ch == 0x000c || ch == 0x000d)
  {
  *lenptr = 1;
  return TRUE;
  }

/* NEL is reported with length 2 in utf8 mode and length 1 otherwise. */

if (ch == 0x0085)
  {
  if (utf8) *lenptr = 2; else *lenptr = 1;
  return TRUE;
  }

/* LS and PS are reported with length 3. */

if (ch == 0x2028 || ch == 0x2029)
  {
  *lenptr = 3;
  return TRUE;
  }

/* Anything else is not a newline; *lenptr is left untouched. */

return FALSE;
}
/* End of pcre_newline.c */

View File

@@ -6,7 +6,7 @@
and semantics are as close as possible to those of the Perl 5 language.
Written by Philip Hazel
Copyright (c) 1997-2007 University of Cambridge
Copyright (c) 1997-2006 University of Cambridge
-----------------------------------------------------------------------------
Redistribution and use in source and binary forms, with or without

View File

@@ -49,9 +49,19 @@ local functions. This source file is used in two places:
compiled regex for debugging purposes. */
/* Macro that decides whether a character should be output as a literal or in
hexadecimal. We don't use isprint() because that can vary from system to system
(even without the use of locales) and we want the output always to be the same,
for testing purposes. This macro is used in pcretest as well as in this file. */
#define PRINTABLE(c) ((c) >= 32 && (c) < 127)
/* The table of operator names. */
static const char *OP_names[] = { OP_NAME_LIST };
/*************************************************
* Print single- or multi-byte character *
*************************************************/
@@ -63,7 +73,7 @@ int c = *ptr;
if (!utf8 || (c & 0xc0) != 0xc0)
{
if (isprint(c)) fprintf(f, "%c", c); else fprintf(f, "\\x%02x", c);
if (PRINTABLE(c)) fprintf(f, "%c", c); else fprintf(f, "\\x%02x", c);
return 0;
}
else
@@ -160,16 +170,6 @@ for(;;)
fprintf(f, "%3d ", (int)(code - codestart));
if (*code >= OP_BRA)
{
if (*code - OP_BRA > EXTRACT_BASIC_MAX)
fprintf(f, "%3d Bra extra\n", GET(code, 1));
else
fprintf(f, "%3d Bra %d\n", GET(code, 1), *code - OP_BRA);
code += _pcre_OP_lengths[OP_BRA];
continue;
}
switch(*code)
{
case OP_END:
@@ -203,6 +203,14 @@ for(;;)
fprintf(f, "\n");
continue;
case OP_CBRA:
case OP_SCBRA:
fprintf(f, "%3d %s %d", GET(code, 1), OP_names[*code],
GET2(code, 1+LINK_SIZE));
break;
case OP_BRA:
case OP_SBRA:
case OP_KETRMAX:
case OP_KETRMIN:
case OP_ALT:
@@ -213,33 +221,45 @@ for(;;)
case OP_ASSERTBACK_NOT:
case OP_ONCE:
case OP_COND:
case OP_SCOND:
case OP_REVERSE:
fprintf(f, "%3d %s", GET(code, 1), OP_names[*code]);
break;
case OP_BRANUMBER:
printf("%3d %s", GET2(code, 1), OP_names[*code]);
case OP_CREF:
fprintf(f, "%3d %s", GET2(code,1), OP_names[*code]);
break;
case OP_CREF:
if (GET2(code, 1) == CREF_RECURSE)
fprintf(f, " Cond recurse");
case OP_RREF:
c = GET2(code, 1);
if (c == RREF_ANY)
fprintf(f, " Cond recurse any");
else
fprintf(f, "%3d %s", GET2(code,1), OP_names[*code]);
fprintf(f, " Cond recurse %d", c);
break;
case OP_DEF:
fprintf(f, " Cond def");
break;
case OP_STAR:
case OP_MINSTAR:
case OP_POSSTAR:
case OP_PLUS:
case OP_MINPLUS:
case OP_POSPLUS:
case OP_QUERY:
case OP_MINQUERY:
case OP_POSQUERY:
case OP_TYPESTAR:
case OP_TYPEMINSTAR:
case OP_TYPEPOSSTAR:
case OP_TYPEPLUS:
case OP_TYPEMINPLUS:
case OP_TYPEPOSPLUS:
case OP_TYPEQUERY:
case OP_TYPEMINQUERY:
case OP_TYPEPOSQUERY:
fprintf(f, " ");
if (*code >= OP_TYPESTAR)
{
@@ -257,17 +277,20 @@ for(;;)
case OP_EXACT:
case OP_UPTO:
case OP_MINUPTO:
case OP_POSUPTO:
fprintf(f, " ");
extra = print_char(f, code+3, utf8);
fprintf(f, "{");
if (*code != OP_EXACT) fprintf(f, ",");
if (*code != OP_EXACT) fprintf(f, "0,");
fprintf(f, "%d}", GET2(code,1));
if (*code == OP_MINUPTO) fprintf(f, "?");
else if (*code == OP_POSUPTO) fprintf(f, "+");
break;
case OP_TYPEEXACT:
case OP_TYPEUPTO:
case OP_TYPEMINUPTO:
case OP_TYPEPOSUPTO:
fprintf(f, " %s", OP_names[code[3]]);
if (code[3] == OP_PROP || code[3] == OP_NOTPROP)
{
@@ -278,20 +301,26 @@ for(;;)
if (*code != OP_TYPEEXACT) fprintf(f, "0,");
fprintf(f, "%d}", GET2(code,1));
if (*code == OP_TYPEMINUPTO) fprintf(f, "?");
else if (*code == OP_TYPEPOSUPTO) fprintf(f, "+");
break;
case OP_NOT:
if (isprint(c = code[1])) fprintf(f, " [^%c]", c);
c = code[1];
if (PRINTABLE(c)) fprintf(f, " [^%c]", c);
else fprintf(f, " [^\\x%02x]", c);
break;
case OP_NOTSTAR:
case OP_NOTMINSTAR:
case OP_NOTPOSSTAR:
case OP_NOTPLUS:
case OP_NOTMINPLUS:
case OP_NOTPOSPLUS:
case OP_NOTQUERY:
case OP_NOTMINQUERY:
if (isprint(c = code[1])) fprintf(f, " [^%c]", c);
case OP_NOTPOSQUERY:
c = code[1];
if (PRINTABLE(c)) fprintf(f, " [^%c]", c);
else fprintf(f, " [^\\x%02x]", c);
fprintf(f, "%s", OP_names[*code]);
break;
@@ -299,11 +328,14 @@ for(;;)
case OP_NOTEXACT:
case OP_NOTUPTO:
case OP_NOTMINUPTO:
if (isprint(c = code[3])) fprintf(f, " [^%c]{", c);
case OP_NOTPOSUPTO:
c = code[3];
if (PRINTABLE(c)) fprintf(f, " [^%c]{", c);
else fprintf(f, " [^\\x%02x]{", c);
if (*code != OP_NOTEXACT) fprintf(f, "0,");
fprintf(f, "%d}", GET2(code,1));
if (*code == OP_NOTMINUPTO) fprintf(f, "?");
else if (*code == OP_NOTPOSUPTO) fprintf(f, "+");
break;
case OP_RECURSE:
@@ -363,12 +395,14 @@ for(;;)
for (j = i+1; j < 256; j++)
if ((ccode[j/8] & (1 << (j&7))) == 0) break;
if (i == '-' || i == ']') fprintf(f, "\\");
if (isprint(i)) fprintf(f, "%c", i); else fprintf(f, "\\x%02x", i);
if (PRINTABLE(i)) fprintf(f, "%c", i);
else fprintf(f, "\\x%02x", i);
if (--j > i)
{
if (j != i + 1) fprintf(f, "-");
if (j == '-' || j == ']') fprintf(f, "\\");
if (isprint(j)) fprintf(f, "%c", j); else fprintf(f, "\\x%02x", j);
if (PRINTABLE(j)) fprintf(f, "%c", j);
else fprintf(f, "\\x%02x", j);
}
i = j;
}

View File

@@ -6,7 +6,7 @@
and semantics are as close as possible to those of the Perl 5 language.
Written by Philip Hazel
Copyright (c) 1997-2007 University of Cambridge
Copyright (c) 1997-2006 University of Cambridge
-----------------------------------------------------------------------------
Redistribution and use in source and binary forms, with or without

View File

@@ -43,6 +43,7 @@ Scanner::Scanner()
input_(data_),
skip_(NULL),
should_skip_(false),
skip_repeat_(false),
save_comments_(false),
comments_(NULL),
comments_offset_(0) {
@@ -53,6 +54,7 @@ Scanner::Scanner(const string& in)
input_(data_),
skip_(NULL),
should_skip_(false),
skip_repeat_(false),
save_comments_(false),
comments_(NULL),
comments_offset_(0) {
@@ -63,15 +65,31 @@ Scanner::~Scanner() {
delete comments_;
}
void Scanner::SetSkipExpression(const char* re) {
delete skip_;
if (re != NULL) {
skip_ = new RE(re);
should_skip_ = true;
skip_repeat_ = true;
ConsumeSkip();
} else {
skip_ = NULL;
should_skip_ = false;
skip_repeat_ = false;
}
}
void Scanner::Skip(const char* re) {
delete skip_;
if (re != NULL) {
skip_ = new RE(re);
should_skip_ = true;
skip_repeat_ = false;
ConsumeSkip();
} else {
skip_ = NULL;
should_skip_ = false;
skip_repeat_ = false;
}
}
@@ -118,19 +136,22 @@ bool Scanner::Consume(const RE& re,
// helper function to consume *skip_ and honour save_comments_
void Scanner::ConsumeSkip() {
const char* start_data = input_.data();
while (skip_->Consume(&input_)) {
if (!skip_repeat_) {
// Only one skip allowed.
break;
}
}
if (save_comments_) {
if (NULL == comments_) {
if (comments_ == NULL) {
comments_ = new vector<StringPiece>;
}
const char *start_data = input_.data();
skip_->Consume(&input_);
// already pointing one past end, so no need to +1
int length = input_.data() - start_data;
if (length > 0) {
comments_->push_back(StringPiece(start_data, length));
}
} else {
skip_->Consume(&input_);
}
}

View File

@@ -36,7 +36,7 @@
// Scanner scanner(input);
// string var;
// int number;
// scanner.Skip("\\s+"); // Skip any white space we encounter
// scanner.SetSkipExpression("\\s+"); // Skip any white space we encounter
// while (scanner.Consume("(\\w+) = (\\d+)", &var, &number)) {
// ...;
// }
@@ -90,10 +90,16 @@ class Scanner {
// skipped. For example, a programming language scanner would use
// a skip RE that matches white space and comments.
//
// scanner.Skip("(\\s|//.*|/[*](.|\n)*?[*]/)*");
// scanner.SetSkipExpression("\\s+|//.*|/[*](.|\n)*?[*]/");
//
// Skipping repeats as long as it succeeds. We used to let people do
// this by writing "(...)*" in the regular expression, but that added
// up to lots of recursive calls within the pcre library, so now we
// control repetition explicitly via the function call API.
//
// You can pass NULL for "re" if you do not want any data to be skipped.
void Skip(const char* re);
void Skip(const char* re); // DEPRECATED; does *not* repeat
void SetSkipExpression(const char* re);
// Temporarily pause "skip"ing. This
// Skip("Foo"); code ; DisableSkip(); code; EnableSkip()
@@ -109,12 +115,13 @@ class Scanner {
/***** Special wrappers around SetSkip() for some common idioms *****/
// Arranges to skip whitespace, C comments, C++ comments.
// The overall RE is a repeated disjunction of the following REs:
// The overall RE is a disjunction of the following REs:
// \\s whitespace
// //.*\n C++ comment
// /[*](.|\n)*?[*]/ C comment (x*? means minimal repetitions of x)
// We get repetition via the semantics of SetSkipExpression, not by using *
void SkipCXXComments() {
Skip("((\\s|//.*\n|/[*](.|\n)*?[*]/)*)");
SetSkipExpression("\\s|//.*\n|/[*](?:\n|.)*?[*]/");
}
void set_save_comments(bool comments) {
@@ -143,6 +150,7 @@ class Scanner {
StringPiece input_; // Unprocessed input
RE* skip_; // If non-NULL, RE for skipping input
bool should_skip_; // If true, use skip_
bool skip_repeat_; // If true, repeat skip_ as long as it works
bool save_comments_; // If true, aggregate the skip expression
// the skipped comments

View File

@@ -33,10 +33,13 @@
// functionality.
#include <stdio.h>
#include <string>
#include <vector>
#include <pcre_stringpiece.h>
#include <pcre_scanner.h>
#define FLAGS_unittest_stack_size 49152
// Dies with a fatal error if the two values are not equal.
#define CHECK_EQ(a, b) do { \
if ( (a) != (b) ) { \
@@ -116,8 +119,31 @@ static void TestScanner() {
comments.resize(0);
}
// Feeds the scanner 1024 indented "# Comment N" lines followed by a single
// "name = value;" assignment, with a skip expression covering whitespace and
// '#' comments, and checks that the assignment is still parsed correctly.
static void TestBigComment() {
  string input;
  char line[1024];
  int i = 0;
  while (i < 1024) {
    snprintf(line, sizeof(line), " # Comment %d\n", i);
    input.append(line);
    ++i;
  }
  input.append("name = value;\n");

  Scanner s(input.c_str());
  s.SetSkipExpression("\\s+|#.*\n");

  // Both captures must come back from the one real statement in the input.
  string name;
  string value;
  s.Consume("(\\w+) = (\\w+);", &name, &value);
  CHECK_EQ(name, "name");
  CHECK_EQ(value, "value");
}
// TODO: also test scanner and big-comment in a thread with a
// small stack size
int main(int argc, char** argv) {
TestScanner();
TestBigComment();
// Done
printf("OK\n");

View File

@@ -6,7 +6,7 @@
and semantics are as close as possible to those of the Perl 5 language.
Written by Philip Hazel
Copyright (c) 1997-2007 University of Cambridge
Copyright (c) 1997-2006 University of Cambridge
-----------------------------------------------------------------------------
Redistribution and use in source and binary forms, with or without
@@ -45,6 +45,11 @@ supporting functions. */
#include "pcre_internal.h"
/* Returns from set_start_bits() */
enum { SSB_FAIL, SSB_DONE, SSB_CONTINUE };
/*************************************************
* Set a bit and maybe its alternate case *
*************************************************/
@@ -72,12 +77,16 @@ if (caseless && (cd->ctypes[c] & ctype_letter) != 0)
/*************************************************
* Create bitmap of starting chars *
* Create bitmap of starting bytes *
*************************************************/
/* This function scans a compiled unanchored expression and attempts to build a
bitmap of the set of initial characters. If it can't, it returns FALSE. As time
goes by, we may be able to get more clever at doing this.
/* This function scans a compiled unanchored expression recursively and
attempts to build a bitmap of the set of possible starting bytes. As time goes
by, we may be able to get more clever at doing this. The SSB_CONTINUE return is
useful for parenthesized groups in patterns such as (a*)b where the group
provides some optional starting bytes but scanning must continue at the outer
level to find at least one mandatory byte. At the outermost level, this
function fails unless the result is SSB_DONE.
Arguments:
code points to an expression
@@ -86,14 +95,17 @@ Arguments:
utf8 TRUE if in UTF-8 mode
cd the block with char table pointers
Returns: TRUE if table built, FALSE otherwise
Returns: SSB_FAIL => Failed to find any starting bytes
SSB_DONE => Found mandatory starting bytes
SSB_CONTINUE => Found optional starting bytes
*/
static BOOL
static int
set_start_bits(const uschar *code, uschar *start_bits, BOOL caseless,
BOOL utf8, compile_data *cd)
{
register int c;
int yield = SSB_DONE;
#if 0
/* ========================================================================= */
@@ -114,25 +126,55 @@ volatile int dummy;
do
{
const uschar *tcode = code + 1 + LINK_SIZE;
const uschar *tcode = code + (((int)*code == OP_CBRA)? 3:1) + LINK_SIZE;
BOOL try_next = TRUE;
while (try_next)
while (try_next) /* Loop for items in this branch */
{
/* If a branch starts with a bracket or a positive lookahead assertion,
recurse to set bits from within them. That's all for this branch. */
if ((int)*tcode >= OP_BRA || *tcode == OP_ASSERT)
int rc;
switch(*tcode)
{
if (!set_start_bits(tcode, start_bits, caseless, utf8, cd))
return FALSE;
try_next = FALSE;
}
/* Fail if we reach something we don't understand */
else switch(*tcode)
{
default:
return FALSE;
return SSB_FAIL;
/* If we hit a bracket or a positive lookahead assertion, recurse to set
bits from within the subpattern. If it can't find anything, we have to
give up. If it finds some mandatory character(s), we are done for this
branch. Otherwise, carry on scanning after the subpattern. */
case OP_BRA:
case OP_SBRA:
case OP_CBRA:
case OP_SCBRA:
case OP_ONCE:
case OP_ASSERT:
rc = set_start_bits(tcode, start_bits, caseless, utf8, cd);
if (rc == SSB_FAIL) return SSB_FAIL;
if (rc == SSB_DONE) try_next = FALSE; else
{
do tcode += GET(tcode, 1); while (*tcode == OP_ALT);
tcode += 1 + LINK_SIZE;
}
break;
/* If we hit ALT or KET, it means we haven't found anything mandatory in
this branch, though we might have found something optional. For ALT, we
continue with the next alternative, but we have to arrange that the final
result from subpattern is SSB_CONTINUE rather than SSB_DONE. For KET,
return SSB_CONTINUE: if this is the top level, that indicates failure,
but after a nested subpattern, it causes scanning to continue. */
case OP_ALT:
yield = SSB_CONTINUE;
try_next = FALSE;
break;
case OP_KET:
case OP_KETRMAX:
case OP_KETRMIN:
return SSB_CONTINUE;
/* Skip over callout */
@@ -140,19 +182,13 @@ do
tcode += 2 + 2*LINK_SIZE;
break;
/* Skip over extended extraction bracket number */
case OP_BRANUMBER:
tcode += 3;
break;
/* Skip over lookbehind and negative lookahead assertions */
case OP_ASSERT_NOT:
case OP_ASSERTBACK:
case OP_ASSERTBACK_NOT:
do tcode += GET(tcode, 1); while (*tcode == OP_ALT);
tcode += 1+LINK_SIZE;
tcode += 1 + LINK_SIZE;
break;
/* Skip over an option setting, changing the caseless flag */
@@ -166,27 +202,30 @@ do
case OP_BRAZERO:
case OP_BRAMINZERO:
if (!set_start_bits(++tcode, start_bits, caseless, utf8, cd))
return FALSE;
if (set_start_bits(++tcode, start_bits, caseless, utf8, cd) == SSB_FAIL)
return SSB_FAIL;
/* =========================================================================
See the comment at the head of this function concerning the next line,
which was an old fudge for the benefit of OS/2.
dummy = 1;
========================================================================= */
do tcode += GET(tcode,1); while (*tcode == OP_ALT);
tcode += 1+LINK_SIZE;
tcode += 1 + LINK_SIZE;
break;
/* Single-char * or ? sets the bit and tries the next item */
case OP_STAR:
case OP_MINSTAR:
case OP_POSSTAR:
case OP_QUERY:
case OP_MINQUERY:
case OP_POSQUERY:
set_bit(start_bits, tcode[1], caseless, cd);
tcode += 2;
#ifdef SUPPORT_UTF8
if (utf8) while ((*tcode & 0xc0) == 0x80) tcode++;
if (utf8 && tcode[-1] >= 0xc0)
tcode += _pcre_utf8_table4[tcode[-1] & 0x3f];
#endif
break;
@@ -194,10 +233,12 @@ do
case OP_UPTO:
case OP_MINUPTO:
case OP_POSUPTO:
set_bit(start_bits, tcode[3], caseless, cd);
tcode += 4;
#ifdef SUPPORT_UTF8
if (utf8) while ((*tcode & 0xc0) == 0x80) tcode++;
if (utf8 && tcode[-1] >= 0xc0)
tcode += _pcre_utf8_table4[tcode[-1] & 0x3f];
#endif
break;
@@ -210,6 +251,7 @@ do
case OP_CHARNC:
case OP_PLUS:
case OP_MINPLUS:
case OP_POSPLUS:
set_bit(start_bits, tcode[1], caseless, cd);
try_next = FALSE;
break;
@@ -283,16 +325,19 @@ do
case OP_TYPEUPTO:
case OP_TYPEMINUPTO:
case OP_TYPEPOSUPTO:
tcode += 2; /* Fall through */
case OP_TYPESTAR:
case OP_TYPEMINSTAR:
case OP_TYPEPOSSTAR:
case OP_TYPEQUERY:
case OP_TYPEMINQUERY:
case OP_TYPEPOSQUERY:
switch(tcode[1])
{
case OP_ANY:
return FALSE;
return SSB_FAIL;
case OP_NOT_DIGIT:
for (c = 0; c < 32; c++)
@@ -418,7 +463,7 @@ do
code += GET(code, 1); /* Advance to next branch */
}
while (*code == OP_ALT);
return TRUE;
return yield;
}
@@ -492,8 +537,8 @@ compile_block.ctypes = tables + ctypes_offset;
/* See if we can find a fixed set of initial characters for the pattern. */
memset(start_bits, 0, 32 * sizeof(uschar));
if (!set_start_bits(code, start_bits, (re->options & PCRE_CASELESS) != 0,
(re->options & PCRE_UTF8) != 0, &compile_block)) return NULL;
if (set_start_bits(code, start_bits, (re->options & PCRE_CASELESS) != 0,
(re->options & PCRE_UTF8) != 0, &compile_block) != SSB_DONE) return NULL;
/* Get a pcre_extra block and a pcre_study_data block. The study data is put in
the latter, which is pointed to by the former, which may also get additional

View File

@@ -6,7 +6,7 @@
and semantics are as close as possible to those of the Perl 5 language.
Written by Philip Hazel
Copyright (c) 1997-2007 University of Cambridge
Copyright (c) 1997-2006 University of Cambridge
-----------------------------------------------------------------------------
Redistribution and use in source and binary forms, with or without
@@ -72,9 +72,8 @@ first byte of a character, indexed by the number of additional bytes. */
const int _pcre_utf8_table2[] = { 0, 0xc0, 0xe0, 0xf0, 0xf8, 0xfc};
const int _pcre_utf8_table3[] = { 0xff, 0x1f, 0x0f, 0x07, 0x03, 0x01};
/* Table of the number of extra characters, indexed by the first character
masked with 0x3f. The highest number for a valid UTF-8 character is in fact
0x3d. */
/* Table of the number of extra bytes, indexed by the first byte masked with
0x3f. The highest number for a valid UTF-8 first byte is in fact 0x3d. */
const uschar _pcre_utf8_table4[] = {
1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
@@ -89,6 +88,7 @@ const ucp_type_table _pcre_utt[] = {
{ "Any", PT_ANY, 0 },
{ "Arabic", PT_SC, ucp_Arabic },
{ "Armenian", PT_SC, ucp_Armenian },
{ "Balinese", PT_SC, ucp_Balinese },
{ "Bengali", PT_SC, ucp_Bengali },
{ "Bopomofo", PT_SC, ucp_Bopomofo },
{ "Braille", PT_SC, ucp_Braille },
@@ -104,6 +104,7 @@ const ucp_type_table _pcre_utt[] = {
{ "Common", PT_SC, ucp_Common },
{ "Coptic", PT_SC, ucp_Coptic },
{ "Cs", PT_PC, ucp_Cs },
{ "Cuneiform", PT_SC, ucp_Cuneiform },
{ "Cypriot", PT_SC, ucp_Cypriot },
{ "Cyrillic", PT_SC, ucp_Cyrillic },
{ "Deseret", PT_SC, ucp_Deseret },
@@ -146,6 +147,7 @@ const ucp_type_table _pcre_utt[] = {
{ "N", PT_GC, ucp_N },
{ "Nd", PT_PC, ucp_Nd },
{ "New_Tai_Lue", PT_SC, ucp_New_Tai_Lue },
{ "Nko", PT_SC, ucp_Nko },
{ "Nl", PT_PC, ucp_Nl },
{ "No", PT_PC, ucp_No },
{ "Ogham", PT_SC, ucp_Ogham },
@@ -158,6 +160,8 @@ const ucp_type_table _pcre_utt[] = {
{ "Pd", PT_PC, ucp_Pd },
{ "Pe", PT_PC, ucp_Pe },
{ "Pf", PT_PC, ucp_Pf },
{ "Phags_Pa", PT_SC, ucp_Phags_Pa },
{ "Phoenician", PT_SC, ucp_Phoenician },
{ "Pi", PT_PC, ucp_Pi },
{ "Po", PT_PC, ucp_Po },
{ "Ps", PT_PC, ucp_Ps },

View File

@@ -6,7 +6,7 @@
and semantics are as close as possible to those of the Perl 5 language.
Written by Philip Hazel
Copyright (c) 1997-2007 University of Cambridge
Copyright (c) 1997-2006 University of Cambridge
-----------------------------------------------------------------------------
Redistribution and use in source and binary forms, with or without

View File

@@ -6,7 +6,7 @@
and semantics are as close as possible to those of the Perl 5 language.
Written by Philip Hazel
Copyright (c) 1997-2007 University of Cambridge
Copyright (c) 1997-2006 University of Cambridge
-----------------------------------------------------------------------------
Redistribution and use in source and binary forms, with or without
@@ -131,11 +131,11 @@ letter, return the other case. Otherwise, return -1.
Arguments:
c the character value
Returns: the other case or -1 if none
Returns: the other case or NOTACHAR if none
*/
int
_pcre_ucp_othercase(const int c)
unsigned int
_pcre_ucp_othercase(const unsigned int c)
{
int bot = 0;
int top = sizeof(ucp_table)/sizeof(cnode);
@@ -161,14 +161,14 @@ for (;;)
}
}
/* Found an entry in the table. Return -1 for a range entry. Otherwise return
the other case if there is one, else -1. */
/* Found an entry in the table. Return NOTACHAR for a range entry. Otherwise
return the other case if there is one, else NOTACHAR. */
if ((ucp_table[mid].f0 & f0_rangeflag) != 0) return -1;
if ((ucp_table[mid].f0 & f0_rangeflag) != 0) return NOTACHAR;
offset = ucp_table[mid].f1 & f1_casemask;
if ((offset & f1_caseneg) != 0) offset |= f1_caseneg;
return (offset == 0)? -1 : c + offset;
return (offset == 0)? NOTACHAR : c + offset;
}

View File

@@ -6,7 +6,7 @@
and semantics are as close as possible to those of the Perl 5 language.
Written by Philip Hazel
Copyright (c) 1997-2007 University of Cambridge
Copyright (c) 1997-2006 University of Cambridge
-----------------------------------------------------------------------------
Redistribution and use in source and binary forms, with or without
@@ -79,7 +79,7 @@ for (p = string; length-- > 0; p++)
register int ab;
register int c = *p;
if (c < 128) continue;
if ((c & 0xc0) != 0xc0) return p - string;
if (c < 0xc0) return p - string;
ab = _pcre_utf8_table4[c & 0x3f]; /* Number of additional bytes */
if (length < ab) return p - string;
length -= ab;

View File

@@ -6,7 +6,7 @@
and semantics are as close as possible to those of the Perl 5 language.
Written by Philip Hazel
Copyright (c) 1997-2007 University of Cambridge
Copyright (c) 1997-2006 University of Cambridge
-----------------------------------------------------------------------------
Redistribution and use in source and binary forms, with or without
@@ -49,16 +49,38 @@ string that identifies the PCRE version that is in use. */
* Return version string *
*************************************************/
/* These macros are the standard way of turning unquoted text into C strings.
They allow macros like PCRE_MAJOR to be defined without quotes, which is
convenient for user programs that want to test its value. */
#define STRING(a) # a
#define XSTRING(s) STRING(s)
/* A problem turned up with PCRE_PRERELEASE, which is defined empty for
production releases. Originally, it was used naively in this code:
return XSTRING(PCRE_MAJOR)
"." XSTRING(PCRE_MINOR)
XSTRING(PCRE_PRERELEASE)
" " XSTRING(PCRE_DATE);
However, when PCRE_PRERELEASE is empty, this leads to an attempted expansion of
STRING(). The C standard states: "If (before argument substitution) any
argument consists of no preprocessing tokens, the behavior is undefined." It
turns out that gcc treats this case as a single empty string - which is what we
really want - but Visual C grumbles about the lack of an argument for the
macro. Unfortunately, both are within their rights. To cope with both ways of
handling this, I had to resort to some messy hackery that does a test at run time.
I could find no way of detecting that a macro is defined as an empty string at
pre-processor time. This hack uses a standard trick for avoiding calling
the STRING macro with an empty argument when doing the test. */
PCRE_DATA_SCOPE const char *
pcre_version(void)
{
return XSTRING(PCRE_MAJOR)
"." XSTRING(PCRE_MINOR)
XSTRING(PCRE_PRERELEASE)
" " XSTRING(PCRE_DATE);
return (XSTRING(Z PCRE_PRERELEASE)[1] == 0)?
XSTRING(PCRE_MAJOR.PCRE_MINOR PCRE_DATE) :
XSTRING(PCRE_MAJOR.PCRE_MINOR) XSTRING(PCRE_PRERELEASE PCRE_DATE);
}
/* End of pcre_version.c */

View File

@@ -6,7 +6,7 @@
and semantics are as close as possible to those of the Perl 5 language.
Written by Philip Hazel
Copyright (c) 1997-2007 University of Cambridge
Copyright (c) 1997-2006 University of Cambridge
-----------------------------------------------------------------------------
Redistribution and use in source and binary forms, with or without

View File

@@ -61,7 +61,7 @@ static const string empty_string;
// If the user doesn't ask for any options, we just use this one
static RE_Options default_options;
void RE::Init(const char* pat, const RE_Options* options) {
void RE::Init(const string& pat, const RE_Options* options) {
pattern_ = pat;
if (options == NULL) {
options_ = default_options;
@@ -78,7 +78,7 @@ void RE::Init(const char* pat, const RE_Options* options) {
// conservative in that it may treat some "simple" patterns
// as "complex" (e.g., if the vertical bar is in a character
// class or is escaped). But it seems good enough.
if (strchr(pat, '|') == NULL) {
if (strchr(pat.c_str(), '|') == NULL) {
// Simple pattern: we can use position-based checks to perform
// fully anchored matches
re_full_ = re_partial_;
@@ -89,12 +89,18 @@ void RE::Init(const char* pat, const RE_Options* options) {
}
}
RE::~RE() {
void RE::Cleanup() {
if (re_full_ != NULL && re_full_ != re_partial_) (*pcre_free)(re_full_);
if (re_partial_ != NULL) (*pcre_free)(re_partial_);
if (error_ != &empty_string) delete error_;
}
RE::~RE() {
Cleanup();
}
pcre* RE::Compile(Anchor anchor) {
// First, convert RE_Options into pcre options
int pcre_options = 0;
@@ -424,6 +430,34 @@ bool RE::Extract(const StringPiece& rewrite,
return Rewrite(out, rewrite, text, vec, matches);
}
/*static*/ string RE::QuoteMeta(const StringPiece& unquoted) {
  // Build a copy of 'unquoted' in which every byte that might carry a
  // special meaning in a regular expression is preceded by a backslash.
  //
  // Bytes left untouched are exactly: ASCII letters, ASCII digits, the
  // underscore, and any byte with the top bit set. Escaping a character
  // that has no special meaning is harmless, which keeps this in line
  // with perl's quotemeta (see `perldoc -f quotemeta`).
  string escaped;
  for (int pos = 0; pos < unquoted.size(); ++pos) {
    const char ch = unquoted[pos];
    // Explicit range tests are used on purpose: the original author
    // measured 'isalnum' as noticeably slower (32ns vs 58ns).
    const bool keep_as_is =
        (ch >= 'a' && ch <= 'z') ||
        (ch >= 'A' && ch <= 'Z') ||
        (ch >= '0' && ch <= '9') ||
        ch == '_' ||
        // A byte with the high bit set is part of a UTF8 or Latin1
        // character; it has to be copied unescaped. Experimentally this
        // is what works correctly with the regexp library.
        (ch & 128);
    if (!keep_as_is) {
      escaped += '\\';
    }
    escaped += ch;
  }
  return escaped;
}
/***** Actual matching and rewriting code *****/
int RE::TryMatch(const StringPiece& text,
@@ -809,14 +843,14 @@ bool Arg::parse_float(const char* str, int n, void* dest) {
return parse_##name##_radix(str, n, dest, 0); \
}
DEFINE_INTEGER_PARSERS(short);
DEFINE_INTEGER_PARSERS(ushort);
DEFINE_INTEGER_PARSERS(int);
DEFINE_INTEGER_PARSERS(uint);
DEFINE_INTEGER_PARSERS(long);
DEFINE_INTEGER_PARSERS(ulong);
DEFINE_INTEGER_PARSERS(longlong);
DEFINE_INTEGER_PARSERS(ulonglong);
DEFINE_INTEGER_PARSERS(short) /* */
DEFINE_INTEGER_PARSERS(ushort) /* */
DEFINE_INTEGER_PARSERS(int) /* Don't use semicolons after these */
DEFINE_INTEGER_PARSERS(uint) /* statements because they can cause */
DEFINE_INTEGER_PARSERS(long) /* compiler warnings if the checking */
DEFINE_INTEGER_PARSERS(ulong) /* level is turned up high enough. */
DEFINE_INTEGER_PARSERS(longlong) /* */
DEFINE_INTEGER_PARSERS(ulonglong) /* */
#undef DEFINE_INTEGER_PARSERS

View File

@@ -112,6 +112,12 @@
// T (where "bool T::ParseFrom(const char*, int)" exists)
// NULL (the corresponding matched sub-pattern is not copied)
//
// CAVEAT: An optional sub-pattern that does not exist in the matched
// string is assigned the empty string. Therefore, the following will
// return false (because the empty string is not a valid number):
// int number;
// pcrecpp::RE::FullMatch("abc", "[a-z]+(\\d+)?", &number);
//
// -----------------------------------------------------------------------
// DO_MATCH
//
@@ -488,8 +494,25 @@ class RE {
// pass in a string or a "const char*" wherever an "RE" is expected.
RE(const char* pat) { Init(pat, NULL); }
RE(const char *pat, const RE_Options& option) { Init(pat, &option); }
RE(const string& pat) { Init(pat.c_str(), NULL); }
RE(const string& pat, const RE_Options& option) { Init(pat.c_str(), &option); }
RE(const string& pat) { Init(pat, NULL); }
RE(const string& pat, const RE_Options& option) { Init(pat, &option); }
// Copy constructor & assignment - note that these are expensive
// because they recompile the expression.
RE(const RE& re) { Init(re.pattern_, &re.options_); }
const RE& operator=(const RE& re) {
if (this != &re) {
Cleanup();
// This is the code that originally came from Google
// Init(re.pattern_.c_str(), &re.options_);
// This is the replacement from Ari Pollak
Init(re.pattern_, &re.options_);
}
return *this;
}
~RE();
@@ -589,6 +612,15 @@ class RE {
const StringPiece &text,
string *out) const;
// Escapes all potentially meaningful regexp characters in
// 'unquoted'. The returned string, used as a regular expression,
// will exactly match the original string. For example,
// 1.5-2.0?
// may become:
// 1\.5\-2\.0\?
static string QuoteMeta(const StringPiece& unquoted);
/***** Generic matching interface *****/
// Type of match (TODO: Should be restructured as part of RE_Options)
@@ -611,7 +643,8 @@ class RE {
private:
void Init(const char* pattern, const RE_Options* options);
void Init(const string& pattern, const RE_Options* options);
void Cleanup();
// Match against "text", filling in "vec" (up to "vecsize" * 2/3) with
// pairs of integers for the beginning and end positions of matched
@@ -655,11 +688,6 @@ class RE {
pcre* re_full_; // For full matches
pcre* re_partial_; // For partial matches
const string* error_; // Error indicator (or points to empty string)
// Don't allow the default copy or assignment constructors --
// they're expensive and too easy to do by accident.
RE(const RE&);
void operator=(const RE&);
};
} // namespace pcrecpp

View File

@@ -1,4 +1,6 @@
// Copyright (c) 2005, Google Inc.
// -*- coding: utf-8 -*-
//
// Copyright (c) 2005 - 2006, Google Inc.
// All rights reserved.
//
// Redistribution and use in source and binary forms, with or without
@@ -445,6 +447,80 @@ static void TestRecursion() {
CHECK(re4.FullMatch(text_bad) == false);
}
// A meta-quoted string, interpreted as a pattern, should always match
// the original unquoted string.
static void TestQuoteMeta(string unquoted, RE_Options options = RE_Options()) {
  // Compile the meta-quoted form of 'unquoted' as a pattern and require
  // that it fully matches the original text.
  RE re(RE::QuoteMeta(unquoted), options);
  CHECK(re.FullMatch(unquoted));
}
// A string containing meaningful regexp characters, which is then meta-
// quoted, should not generally match a string the unquoted string does.
static void NegativeTestQuoteMeta(string unquoted, string should_not_match,
                                  RE_Options options = RE_Options()) {
  // Compile the meta-quoted form of 'unquoted' and require that it does
  // NOT fully match 'should_not_match'.
  RE re(RE::QuoteMeta(unquoted), options);
  CHECK(!re.FullMatch(should_not_match));
}
// Tests that quoted meta characters match their original strings,
// and that a few things that shouldn't match indeed do not.
static void TestQuotaMetaSimple() {
  // Each of these strings, once meta-quoted, must match itself. They are
  // run in the order listed, with default options.
  static const char* const kSelfMatching[] = {
    "foo",
    "foo.bar",
    "foo\\.bar",
    "[1-9]",
    "1.5-2.0?",
    "\\d",
    "Who doesn't like ice cream?",
    "((a|b)c?d*e+[f-h]i)",
    "((?!)xxx).*yyy",
    "(["
  };
  const int count = sizeof(kSelfMatching) / sizeof(kSelfMatching[0]);
  for (int idx = 0; idx < count; ++idx) {
    TestQuoteMeta(kSelfMatching[idx]);
  }
}
static void TestQuoteMetaSimpleNegative() {
  // Pairs of (text to quote, subject that the quoted pattern must reject).
  // They are run in the order listed, with default options.
  static const struct {
    const char* unquoted;
    const char* should_not_match;
  } kRejected[] = {
    { "foo", "bar" },
    { "...", "bar" },
    { "\\.", "." },
    { "\\.", ".." },
    { "(a)", "a" },
    { "(a|b)", "a" },
    { "(a|b)", "(a)" },
    { "(a|b)", "a|b" },
    { "[0-9]", "0" },
    { "[0-9]", "0-9" },
    { "[0-9]", "[9]" },
    { "((?!)xxx)", "xxx" }
  };
  const int count = sizeof(kRejected) / sizeof(kRejected[0]);
  for (int idx = 0; idx < count; ++idx) {
    NegativeTestQuoteMeta(kRejected[idx].unquoted,
                          kRejected[idx].should_not_match);
  }
}
static void TestQuoteMetaLatin1() {
TestQuoteMeta("3\xb2 = 9");
}
static void TestQuoteMetaUtf8() {
#ifdef SUPPORT_UTF8
  // Subjects that must match their own quoted form when compiled with the
  // UTF8 option, run in the order listed.
  static const char* const kUtf8Subjects[] = {
    "Pl\xc3\xa1\x63ido Domingo",
    "xyz",                      // No fancy utf8
    "\xc2\xb0",                 // 2-byte utf8 (degree symbol)
    "27\xc2\xb0 degrees",       // As a middle character
    "\xe2\x80\xb3",             // 3-byte utf8 (double prime)
    "\xf0\x9d\x85\x9f"          // 4-byte utf8 (music note)
  };
  const int count = sizeof(kUtf8Subjects) / sizeof(kUtf8Subjects[0]);
  for (int idx = 0; idx < count; ++idx) {
    TestQuoteMeta(kUtf8Subjects[idx], pcrecpp::UTF8());
  }

  // Same bytes without the UTF8 option: interpreted as Latin-1, but should
  // still work.
  TestQuoteMeta("27\xc2\xb0");

  // 2-byte utf (degree symbol): the quoted pattern must not match a subject
  // in which each of the two bytes is preceded by a backslash.
  NegativeTestQuoteMeta("27\xc2\xb0",
                        "27\\\xc2\\\xb0",
                        pcrecpp::UTF8());
#endif
}
static void TestQuoteMetaAll() {
  // Announce the group, then run every QuoteMeta test in a fixed order.
  typedef void (*QuoteMetaTest)();
  static const QuoteMetaTest kTests[] = {
    TestQuotaMetaSimple,
    TestQuoteMetaSimpleNegative,
    TestQuoteMetaLatin1,
    TestQuoteMetaUtf8
  };

  printf("Testing QuoteMeta\n");
  for (unsigned int idx = 0; idx < sizeof(kTests) / sizeof(kTests[0]); ++idx) {
    kTests[idx]();
  }
}
//
// Options tests contributed by
// Giuseppe Maxia, CTO, Stardata s.r.l.
@@ -667,6 +743,35 @@ static void TestOptions() {
Test_all_options();
}
// Exercises RE's copy constructor and assignment operator: a copy, an
// assigned-to object, and a self-assigned object must all still match the
// same subject as the RE they were taken from.
static void TestConstructors() {
printf("Testing constructors\n");
RE_Options options;
options.set_dotall(true);
// The subject spans three lines; the match below presumably relies on the
// dotall option set above so that ".*" can cross the newlines.
const char *str = "HELLO\n" "cruel\n" "world";
RE orig("HELLO.*world", options);
CHECK(orig.FullMatch(str));
// Copy construction: the copy must match exactly as the original does.
RE copy1(orig);
CHECK(copy1.FullMatch(str));
// copy2 starts out as an unrelated pattern that does not match ...
RE copy2("not a match");
CHECK(!copy2.FullMatch(str));
// ... and must match after being assigned from a copy, and again after
// being assigned from the original.
copy2 = copy1;
CHECK(copy2.FullMatch(str));
copy2 = orig;
CHECK(copy2.FullMatch(str));
// Make sure when we assign to ourselves, nothing bad happens
orig = orig;
copy1 = copy1;
copy2 = copy2;
CHECK(orig.FullMatch(str));
CHECK(copy1.FullMatch(str));
CHECK(copy2.FullMatch(str));
}
int main(int argc, char** argv) {
// Treat any flag as --help
if (argc > 1 && argv[1][0] == '-') {
@@ -985,11 +1090,14 @@ int main(int argc, char** argv) {
CHECK(RE("h.*o").PartialMatch("hello!"));
CHECK(RE("((((((((((((((((((((x))))))))))))))))))))").PartialMatch("x"));
/***** other tests *****/
RadixTests();
TestReplace();
TestExtract();
TestConsume();
TestFindAndConsume();
TestQuoteMetaAll();
TestMatchNumberPeculiarity();
// Check the pattern() accessor
@@ -1109,6 +1217,9 @@ int main(int argc, char** argv) {
VERBOSE_TEST = true;
TestOptions();
// Test the constructors
TestConstructors();
// Done
printf("OK\n");

View File

@@ -6,7 +6,7 @@
its pattern matching. On a Unix or Win32 system it can recurse into
directories.
Copyright (c) 1997-2007 University of Cambridge
Copyright (c) 1997-2006 University of Cambridge
-----------------------------------------------------------------------------
Redistribution and use in source and binary forms, with or without
@@ -56,7 +56,7 @@ POSSIBILITY OF SUCH DAMAGE.
typedef int BOOL;
#define VERSION "4.3 01-Jun-2006"
#define VERSION "4.4 29-Nov-2006"
#define MAX_PATTERN_COUNT 100
#if BUFSIZ > 8192
@@ -65,7 +65,6 @@ typedef int BOOL;
#define MBUFTHIRD 8192
#endif
/* Values for the "filenames" variable, which specifies options for file name
output. The order is important; it is assumed that a file name is wanted for
all values greater than FN_DEFAULT. */
@@ -83,6 +82,10 @@ enum { DEE_READ, DEE_SKIP };
#define PO_LINE_MATCH 0x0002
#define PO_FIXED_STRINGS 0x0004
/* Line ending types */
enum { EL_LF, EL_CR, EL_CRLF, EL_ANY };
/*************************************************
@@ -100,8 +103,7 @@ static const char *jfriedl_prefix = "";
static const char *jfriedl_postfix = "";
#endif
static int endlinebyte = '\n'; /* Last byte of endline sequence */
static int endlineextra = 0; /* Extra bytes for endline sequence */
static int endlinetype;
static char *colour_string = (char *)"1;31";
static char *colour_option = NULL;
@@ -142,6 +144,7 @@ static BOOL number = FALSE;
static BOOL only_matching = FALSE;
static BOOL quiet = FALSE;
static BOOL silent = FALSE;
static BOOL utf8 = FALSE;
/* Structure for options and list of them */
@@ -219,6 +222,16 @@ static const char *prefix[] = {
static const char *suffix[] = {
"", "\\b", ")$", ")$", "\\E", "\\E\\b", "\\E)$", "\\E)$" };
/* UTF-8 tables - used only when the newline setting is "all". */
const int utf8_table3[] = { 0xff, 0x1f, 0x0f, 0x07, 0x03, 0x01};
const char utf8_table4[] = {
1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,
3,3,3,3,3,3,3,3,4,4,4,4,5,5,5,5 };
/*************************************************
@@ -470,6 +483,216 @@ return sys_errlist[n];
/*************************************************
* Find end of line *
*************************************************/
/* Scan forward from p for the next line ending, as selected by the global
endlinetype, and report how many bytes that line ending occupies.

Arguments:
  p         current position in line
  endptr    end of available data
  lenptr    receives the length of the line-ending sequence that was found;
              set to zero if the data ran out before one was seen (which can
              happen at the very end of a file with no final line ending)

Returns:    pointer to the byte just past the line-ending sequence, or
              endptr if no line ending was found
*/

static char *
end_of_line(char *p, char *endptr, int *lenptr)
{
char eol;

/* CRLF: only a CR that is immediately followed by LF terminates the line. */

if (endlinetype == EL_CRLF)
  {
  for (;;)
    {
    while (p < endptr && *p != '\r') p++;
    p++;                               /* Step over the CR (or past the end) */
    if (p >= endptr) break;
    if (*p == '\n')
      {
      *lenptr = 2;
      return p + 1;
      }
    }
  *lenptr = 0;
  return endptr;
  }

/* ANY: look at one character at a time (decoding UTF-8 when the utf8 flag is
set) and stop at any of the characters tested below. */

if (endlinetype == EL_ANY)
  {
  while (p < endptr)
    {
    int trail = 0;
    register int ch = *((unsigned char *)p);

    if (utf8 && ch >= 0xc0)
      {
      int k, shift;
      trail = utf8_table4[ch & 0x3f];  /* Number of additional bytes */
      shift = 6*trail;
      ch = (ch & utf8_table3[trail]) << shift;
      for (k = 1; k <= trail; k++)
        {
        shift -= 6;
        ch |= (p[k] & 0x3f) << shift;
        }
      }

    p += 1 + trail;                    /* p is now past this character */

    if (ch == 0x0a || ch == 0x0b || ch == 0x0c)     /* LF, VT, FF */
      {
      *lenptr = 1;
      return p;
      }

    if (ch == 0x0d)                                 /* CR, maybe CRLF */
      {
      if (p < endptr && *p == 0x0a)
        {
        *lenptr = 2;
        p++;
        }
      else *lenptr = 1;
      return p;
      }

    if (ch == 0x85)                                 /* NEL */
      {
      *lenptr = utf8? 2 : 1;
      return p;
      }

    if (ch == 0x2028 || ch == 0x2029)               /* LS, PS */
      {
      *lenptr = 3;
      return p;
      }
    }

  *lenptr = 0;                         /* Must have hit the end */
  return endptr;
  }

/* Single-byte endings: CR when selected, otherwise LF. Any unrecognized
endlinetype value is treated as LF, just in case. */

eol = (endlinetype == EL_CR)? '\r' : '\n';
while (p < endptr && *p != eol) p++;
if (p >= endptr)
  {
  *lenptr = 0;
  return endptr;
  }
*lenptr = 1;
return p + 1;
}
/*************************************************
* Find start of previous line *
*************************************************/
/* This is called when looking back for "before" context lines to print. It
steps backwards over the line ending that immediately precedes p and then
scans back to the start of that line, using the line-ending convention
selected by the global endlinetype.

Arguments:
  p         start of the subsequent line
  startptr  start of available data

Returns:    pointer to the start of the previous line
*/

static char *
previous_line(char *p, char *startptr)
{
switch(endlinetype)
  {
  default:      /* Just in case */
  case EL_LF:
  p--;
  while (p > startptr && p[-1] != '\n') p--;
  return p;

  /* With CR line endings the previous line starts just after the preceding
  CR. (Scanning for LF here, as the code once did, ran back to the start of
  the data, or to a stray LF, instead of stopping at the previous line.) */

  case EL_CR:
  p--;
  while (p > startptr && p[-1] != '\r') p--;
  return p;

  /* Skip the two-byte ending before p, then look for an LF that is itself
  preceded by CR; a lone LF does not count as a line ending. */

  case EL_CRLF:
  for (;;)
    {
    p -= 2;
    while (p > startptr && p[-1] != '\n') p--;
    if (p <= startptr + 1 || p[-2] == '\r') return p;
    }
  return p;   /* But control should never get here */

  case EL_ANY:

  /* Step back over the ending before p; if it is LF preceded by CR, step
  over both bytes. In UTF-8 mode, then back up to the first byte of a
  multibyte ending character. */

  if (*(--p) == '\n' && p > startptr && p[-1] == '\r') p--;
  if (utf8) while ((*p & 0xc0) == 0x80) p--;

  while (p > startptr)
    {
    register int c;
    char *pp = p - 1;

    if (utf8)
      {
      int extra = 0;
      while ((*pp & 0xc0) == 0x80) pp--;   /* Find start of the character */
      c = *((unsigned char *)pp);
      if (c >= 0xc0)
        {
        int gcii, gcss;
        extra = utf8_table4[c & 0x3f];  /* Number of additional bytes */
        gcss = 6*extra;
        c = (c & utf8_table3[extra]) << gcss;
        for (gcii = 1; gcii <= extra; gcii++)
          {
          gcss -= 6;
          c |= (pp[gcii] & 0x3f) << gcss;
          }
        }
      }
    else c = *((unsigned char *)pp);

    /* If the character before p is any kind of line ending, p is the start
    of the line we are looking for. */

    switch (c)
      {
      case 0x0a:    /* LF */
      case 0x0b:    /* VT */
      case 0x0c:    /* FF */
      case 0x0d:    /* CR */
      case 0x85:    /* NEL */
      case 0x2028:  /* LS */
      case 0x2029:  /* PS */
      return p;

      default:
      break;
      }

    p = pp;  /* Back one character */
    }        /* End of loop for ANY case */

  return startptr;  /* Hit start of data */
  }     /* End of overall switch */
}
/*************************************************
* Print the previous "after" lines *
*************************************************/
@@ -495,13 +718,13 @@ if (after_context > 0 && lastmatchnumber > 0)
int count = 0;
while (lastmatchrestart < endptr && count++ < after_context)
{
int ellength;
char *pp = lastmatchrestart;
if (printname != NULL) fprintf(stdout, "%s-", printname);
if (number) fprintf(stdout, "%d-", lastmatchnumber++);
while (*pp != endlinebyte) pp++;
fwrite(lastmatchrestart, 1, pp - lastmatchrestart + (1 + endlineextra),
stdout);
lastmatchrestart = pp + 1;
pp = end_of_line(pp, endptr, &ellength);
fwrite(lastmatchrestart, 1, pp - lastmatchrestart, stdout);
lastmatchrestart = pp;
}
hyphenpending = TRUE;
}
@@ -558,7 +781,7 @@ way, the buffer is shifted left and re-filled. */
while (ptr < endptr)
{
int i;
int i, endlinelength;
int mrc = 0;
BOOL match = FALSE;
char *t = ptr;
@@ -571,11 +794,10 @@ while (ptr < endptr)
line. In multiline mode the PCRE_FIRSTLINE option is used for compiling, so
that any match is constrained to be in the first line. */
linelength = 0;
while (t < endptr && *t++ != endlinebyte) linelength++;
t = end_of_line(t, endptr, &endlinelength);
linelength = t - ptr - endlinelength;
length = multiline? endptr - ptr : linelength;
/* Extra processing for Jeffrey Friedl's debugging. */
#ifdef JFRIEDL_DEBUG
@@ -706,13 +928,13 @@ while (ptr < endptr)
if (after_context > 0 && lastmatchnumber > 0)
{
int ellength;
int linecount = 0;
char *p = lastmatchrestart;
while (p < ptr && linecount < after_context)
{
while (*p != endlinebyte) p++;
p++;
p = end_of_line(p, ptr, &ellength);
linecount++;
}
@@ -725,10 +947,9 @@ while (ptr < endptr)
char *pp = lastmatchrestart;
if (printname != NULL) fprintf(stdout, "%s-", printname);
if (number) fprintf(stdout, "%d-", lastmatchnumber++);
while (*pp != endlinebyte) pp++;
fwrite(lastmatchrestart, 1, pp - lastmatchrestart +
(1 + endlineextra), stdout);
lastmatchrestart = pp + 1;
pp = end_of_line(pp, endptr, &ellength);
fwrite(lastmatchrestart, 1, pp - lastmatchrestart, stdout);
lastmatchrestart = pp;
}
if (lastmatchrestart != ptr) hyphenpending = TRUE;
}
@@ -754,8 +975,7 @@ while (ptr < endptr)
linecount < before_context)
{
linecount++;
p--;
while (p > buffer && p[-1] != endlinebyte) p--;
p = previous_line(p, buffer);
}
if (lastmatchnumber > 0 && p > lastmatchrestart && !hyphenprinted)
@@ -763,12 +983,13 @@ while (ptr < endptr)
while (p < ptr)
{
int ellength;
char *pp = p;
if (printname != NULL) fprintf(stdout, "%s-", printname);
if (number) fprintf(stdout, "%d-", linenumber - linecount--);
while (*pp != endlinebyte) pp++;
fwrite(p, 1, pp - p + (1 + endlineextra), stdout);
p = pp + 1;
pp = end_of_line(pp, endptr, &ellength);
fwrite(p, 1, pp - p, stdout);
p = pp;
}
}
@@ -788,11 +1009,16 @@ while (ptr < endptr)
if (multiline)
{
int ellength;
char *endmatch = ptr + offsets[1];
t = ptr;
while (t < endmatch) { if (*t++ == endlinebyte) linenumber++; }
while (endmatch < endptr && *endmatch != endlinebyte) endmatch++;
linelength = endmatch - ptr;
while (t < endmatch)
{
t = end_of_line(t, endptr, &ellength);
if (t <= endmatch) linenumber++; else break;
}
endmatch = end_of_line(endmatch, endptr, &ellength);
linelength = endmatch - ptr - ellength;
}
/*** NOTE: Use only fwrite() to output the data line, so that binary
@@ -824,9 +1050,7 @@ while (ptr < endptr)
fprintf(stdout, "%c[00m", 0x1b);
fwrite(ptr + offsets[1], 1, linelength - offsets[1], stdout);
}
else fwrite(ptr, 1, linelength, stdout);
fprintf(stdout, "\n");
else fwrite(ptr, 1, linelength + endlinelength, stdout);
}
/* End of doing what has to be done for a match */
@@ -836,13 +1060,13 @@ while (ptr < endptr)
/* Remember where the last match happened for after_context. We remember
where we are about to restart, and that line's number. */
lastmatchrestart = ptr + linelength + 1;
lastmatchrestart = ptr + linelength + endlinelength;
lastmatchnumber = linenumber + 1;
}
/* Advance to after the newline and increment the line number. */
ptr += linelength + 1;
ptr += linelength + endlinelength;
linenumber++;
/* If we haven't yet reached the end of the file (the buffer is full), and
@@ -1098,7 +1322,7 @@ switch(letter)
case 'q': quiet = TRUE; break;
case 'r': dee_action = dee_RECURSE; break;
case 's': silent = TRUE; break;
case 'u': options |= PCRE_UTF8; break;
case 'u': options |= PCRE_UTF8; utf8 = TRUE; break;
case 'v': invert = TRUE; break;
case 'w': process_options |= PO_WORD_MATCH; break;
case 'x': process_options |= PO_LINE_MATCH; break;
@@ -1231,14 +1455,16 @@ compile_pattern(char *pattern, int options, char *filename, int count)
{
if ((process_options & PO_FIXED_STRINGS) != 0)
{
char *eop = pattern + strlen(pattern);
char buffer[MBUFTHIRD];
for(;;)
{
char *p = strchr(pattern, endlinebyte);
if (p == NULL)
int ellength;
char *p = end_of_line(pattern, eop, &ellength);
if (ellength == 0)
return compile_single_pattern(pattern, options, filename, count);
sprintf(buffer, "%.*s", p - pattern - endlineextra, pattern);
pattern = p + 1;
sprintf(buffer, "%.*s", p - pattern - ellength, pattern);
pattern = p;
if (!compile_single_pattern(buffer, options, filename, count))
return FALSE;
}
@@ -1267,7 +1493,9 @@ char *patterns[MAX_PATTERN_COUNT];
const char *locale_from = "--locale";
const char *error;
/* Set the default line ending value from the default in the PCRE library. */
/* Set the default line ending value from the default in the PCRE library;
"lf", "cr", "crlf", and "any" are supported. Anything else is treated as "lf".
*/
(void)pcre_config(PCRE_CONFIG_NEWLINE, &i);
switch(i)
@@ -1275,6 +1503,7 @@ switch(i)
default: newline = (char *)"lf"; break;
case '\r': newline = (char *)"cr"; break;
case ('\r' << 8) | '\n': newline = (char *)"crlf"; break;
case -1: newline = (char *)"any"; break;
}
/* Process the options */
@@ -1565,16 +1794,22 @@ if (colour_option != NULL && strcmp(colour_option, "never") != 0)
if (strcmp(newline, "cr") == 0 || strcmp(newline, "CR") == 0)
{
pcre_options |= PCRE_NEWLINE_CR;
endlinebyte = '\r';
endlinetype = EL_CR;
}
else if (strcmp(newline, "lf") == 0 || strcmp(newline, "LF") == 0)
{
pcre_options |= PCRE_NEWLINE_LF;
endlinetype = EL_LF;
}
else if (strcmp(newline, "crlf") == 0 || strcmp(newline, "CRLF") == 0)
{
pcre_options |= PCRE_NEWLINE_CRLF;
endlineextra = 1;
endlinetype = EL_CRLF;
}
else if (strcmp(newline, "any") == 0 || strcmp(newline, "ANY") == 0)
{
pcre_options |= PCRE_NEWLINE_ANY;
endlinetype = EL_ANY;
}
else
{

View File

@@ -6,7 +6,7 @@
and semantics are as close as possible to those of the Perl 5 language.
Written by Philip Hazel
Copyright (c) 1997-2007 University of Cambridge
Copyright (c) 1997-2006 University of Cambridge
-----------------------------------------------------------------------------
Redistribution and use in source and binary forms, with or without
@@ -78,7 +78,7 @@ static const int eint[] = {
REG_BADPAT, /* unrecognized character after (?< */
REG_BADPAT, /* lookbehind assertion is not fixed length */
REG_BADPAT, /* malformed number or name after (?( */
REG_BADPAT, /* conditional group containe more than two branches */
REG_BADPAT, /* conditional group contains more than two branches */
REG_BADPAT, /* assertion expected after (?( */
REG_BADPAT, /* (?R or (?digits must be followed by ) */
REG_ECTYPE, /* unknown POSIX class name */
@@ -93,7 +93,7 @@ static const int eint[] = {
REG_BADPAT, /* closing ) for (?C expected */
REG_BADPAT, /* recursive call could loop indefinitely */
REG_BADPAT, /* unrecognized character after (?P */
REG_BADPAT, /* syntax error after (?P */
REG_BADPAT, /* syntax error in subpattern name (missing terminator) */
REG_BADPAT, /* two named subpatterns have the same name */
REG_BADPAT, /* invalid UTF-8 string */
REG_BADPAT, /* support for \P, \p, and \X has not been compiled */
@@ -102,7 +102,13 @@ static const int eint[] = {
REG_BADPAT, /* subpattern name is too long (maximum 32 characters) */
REG_BADPAT, /* too many named subpatterns (maximum 10,000) */
REG_BADPAT, /* repeated subpattern is too long */
REG_BADPAT /* octal value is greater than \377 (not in UTF-8 mode) */
REG_BADPAT, /* octal value is greater than \377 (not in UTF-8 mode) */
REG_BADPAT, /* internal error: overran compiling workspace */
REG_BADPAT, /* internal error: previously-checked referenced subpattern not found */
REG_BADPAT, /* DEFINE group contains more than one branch */
REG_BADPAT, /* repeating a DEFINE group is not allowed */
REG_INVARG, /* inconsistent NEWLINE options */
REG_BADPAT /* \g is not followed by an (optionally braced) non-zero number */
};
/* Table of texts corresponding to POSIX error codes */

View File

@@ -9,7 +9,7 @@
Compatible Regular Expression library. It defines the things POSIX says should
be there. I hope.
Copyright (c) 1997-2007 University of Cambridge
Copyright (c) 1997-2006 University of Cambridge
-----------------------------------------------------------------------------
Redistribution and use in source and binary forms, with or without

View File

@@ -44,10 +44,29 @@ POSSIBILITY OF SUCH DAMAGE.
#include <locale.h>
#include <errno.h>
#ifndef _WIN32
#include <sys/resource.h>
/* A number of things vary for Windows builds. Originally, pcretest opened its
input and output without "b"; then I was told that "b" was needed in some
environments, so it was added for release 5.0 to both the input and output. (It
makes no difference on Unix-like systems.) Later I was told that it is wrong
for the input on Windows. I've now abstracted the modes into two macros that
are set here, to make it easier to fiddle with them, and removed "b" from the
input mode under Windows. */
#if defined(_WIN32) || defined(WIN32)
#include <io.h> /* For _setmode() */
#include <fcntl.h> /* For _O_BINARY */
#define INPUT_MODE "r"
#define OUTPUT_MODE "wb"
#else
#include <sys/time.h> /* These two includes are needed */
#include <sys/resource.h> /* for setrlimit(). */
#define INPUT_MODE "rb"
#define OUTPUT_MODE "wb"
#endif
#define PCRE_SPY /* For Win32 build, import data, not export */
/* We include pcre_internal.h because we need the internal info for displaying
@@ -74,10 +93,18 @@ symbols to prevent clashes. */
/* We also need the pcre_printint() function for printing out compiled
patterns. This function is in a separate file so that it can be included in
pcre_compile.c when that module is compiled with debugging enabled. */
pcre_compile.c when that module is compiled with debugging enabled.
The definition of the macro PRINTABLE, which determines whether to print an
output character as-is or as a hex value when showing compiled patterns, is
contained in this file. We use it here also, in cases when the locale has not
been explicitly changed, so as to get consistent output from systems that
differ in their output from isprint() even in the "C" locale. */
#include "pcre_printint.src"
#define PRINTHEX(c) (locale_set? isprint(c) : PRINTABLE(c))
/* It is possible to compile this test program without including support for
testing the POSIX interface, though this is not available via the standard
@@ -103,6 +130,8 @@ function (define NOINFOCHECK). */
#endif
#endif
/* This is the default loop count for timing. */
#define LOOPREPEAT 500000
/* Static variables */
@@ -114,6 +143,7 @@ static int callout_extra;
static int callout_fail_count;
static int callout_fail_id;
static int first_callout;
static int locale_set = 0;
static int show_malloc;
static int use_utf8;
static size_t gotten_store;
@@ -157,6 +187,7 @@ uschar *here = start;
for (;;)
{
int rlen = buffer_size - (here - buffer);
if (rlen > 1000)
{
int dlen;
@@ -213,7 +244,7 @@ return NULL; /* Control never gets here */
/* We don't use strtoul() because SunOS4 doesn't have it. Rather than mess
around with conditional compilation, just do the job by hand. It is only used
for unpicking the -o argument, so just keep it simple.
for unpicking arguments, so just keep it simple.
Arguments:
str string to be converted
@@ -311,6 +342,8 @@ Arguments:
Returns: number of characters placed in the buffer
*/
#if !defined NOUTF8
static int
ord2utf8(int cvalue, uschar *utf8bytes)
{
@@ -327,6 +360,8 @@ for (j = i; j > 0; j--)
return i + 1;
}
#endif
/*************************************************
@@ -353,16 +388,19 @@ while (length-- > 0)
{
length -= rc - 1;
p += rc;
if (c < 256 && isprint(c))
if (PRINTHEX(c))
{
if (f != NULL) fprintf(f, "%c", c);
yield++;
}
else
{
int n;
if (f != NULL) fprintf(f, "\\x{%02x}%n", c, &n);
yield += n;
int n = 4;
if (f != NULL) fprintf(f, "\\x{%02x}", c);
yield += (n <= 0x000000ff)? 2 :
(n <= 0x00000fff)? 3 :
(n <= 0x0000ffff)? 4 :
(n <= 0x000fffff)? 5 : 6;
}
continue;
}
@@ -371,7 +409,8 @@ while (length-- > 0)
/* Not UTF-8, or malformed UTF-8 */
if (isprint(c = *(p++)))
c = *p++;
if (PRINTHEX(c))
{
if (f != NULL) fprintf(f, "%c", c);
yield++;
@@ -614,7 +653,7 @@ return count;
*************************************************/
/* This is used both at compile and run-time to check for <xxx> escapes, where
xxx is LF, CR, or CRLF. Print a message and return 0 if there is no match.
xxx is LF, CR, CRLF, or ANY. Print a message and return 0 if there is no match.
Arguments:
p points after the leading '<'
@@ -629,12 +668,45 @@ check_newline(uschar *p, FILE *f)
if (strncmp((char *)p, "cr>", 3) == 0) return PCRE_NEWLINE_CR;
if (strncmp((char *)p, "lf>", 3) == 0) return PCRE_NEWLINE_LF;
if (strncmp((char *)p, "crlf>", 5) == 0) return PCRE_NEWLINE_CRLF;
if (strncmp((char *)p, "any>", 4) == 0) return PCRE_NEWLINE_ANY;
fprintf(f, "Unknown newline type at: <%s\n", p);
return 0;
}
/*************************************************
* Usage function *
*************************************************/
static void
usage(void)
{
/* Print the command-line summary on stdout. The text is emitted in a few
fixed chunks; the only variable parts are the two options that depend on
compile-time settings. */

fputs("Usage: pcretest [options] [<input> [<output>]]\n"
      " -b show compiled code (bytecode)\n"
      " -C show PCRE compile-time options and exit\n"
      " -d debug: show compiled code and information (-b and -i)\n",
      stdout);

/* The -dfa line is omitted when NODFA is defined. */

#if !defined NODFA
fputs(" -dfa force DFA matching for all subjects\n", stdout);
#endif

fputs(" -help show usage information\n"
      " -i show information about compiled patterns\n"
      " -m output memory used information\n"
      " -o <n> set size of offsets vector to <n>\n",
      stdout);

/* The -p line is omitted when NOPOSIX is defined. */

#if !defined NOPOSIX
fputs(" -p use POSIX interface\n", stdout);
#endif

fputs(" -q quiet: do not output PCRE version number at start\n"
      " -S <n> set stack size to <n> megabytes\n"
      " -s output store (memory) used information\n"
      " -t time compilation and execution\n"
      " -t <n> time compilation and execution, repeating <n> times\n"
      " -tm time execution (matching) only\n"
      " -tm <n> time execution (matching) only, repeating <n> times\n",
      stdout);
}
/*************************************************
* Main Program *
*************************************************/
@@ -650,6 +722,7 @@ int options = 0;
int study_options = 0;
int op = 1;
int timeit = 0;
int timeitm = 0;
int showinfo = 0;
int showstore = 0;
int quiet = 0;
@@ -681,16 +754,19 @@ buffer = (unsigned char *)malloc(buffer_size);
dbuffer = (unsigned char *)malloc(buffer_size);
pbuffer = (unsigned char *)malloc(buffer_size);
/* The outfile variable is static so that new_malloc can use it. The _setmode()
stuff is some magic that I don't understand, but which apparently does good
things in Windows. It's related to line terminations. */
#if defined(_WIN32) || defined(WIN32)
_setmode( _fileno( stdout ), 0x8000 );
#endif /* defined(_WIN32) || defined(WIN32) */
/* The outfile variable is static so that new_malloc can use it. */
outfile = stdout;
/* The following _setmode() stuff is some Windows magic that puts stdout into
binary (untranslated) mode, so that its runtime library does not expand each LF
into CRLF on output. At least, that's what I've been told: never having used
Windows I take this all on trust. Originally it set 0x8000, but then I was
advised that _O_BINARY was better. */
#if defined(_WIN32) || defined(WIN32)
_setmode( _fileno( stdout ), _O_BINARY );
#endif
/* Scan options */
while (argc > 1 && argv[op][0] == '-')
@@ -699,8 +775,8 @@ while (argc > 1 && argv[op][0] == '-')
if (strcmp(argv[op], "-s") == 0 || strcmp(argv[op], "-m") == 0)
showstore = 1;
else if (strcmp(argv[op], "-t") == 0) timeit = 1;
else if (strcmp(argv[op], "-q") == 0) quiet = 1;
else if (strcmp(argv[op], "-b") == 0) debug = 1;
else if (strcmp(argv[op], "-i") == 0) showinfo = 1;
else if (strcmp(argv[op], "-d") == 0) showinfo = debug = 1;
#if !defined NODFA
@@ -713,11 +789,25 @@ while (argc > 1 && argv[op][0] == '-')
op++;
argc--;
}
else if (strcmp(argv[op], "-t") == 0 || strcmp(argv[op], "-tm") == 0)
{
int both = argv[op][2] == 0;
int temp;
if (argc > 2 && (temp = get_value((unsigned char *)argv[op+1], &endptr),
*endptr == 0))
{
timeitm = temp;
op++;
argc--;
}
else timeitm = LOOPREPEAT;
if (both) timeit = timeitm;
}
else if (strcmp(argv[op], "-S") == 0 && argc > 2 &&
((stack_size = get_value((unsigned char *)argv[op+1], &endptr)),
*endptr == 0))
{
#ifdef _WIN32
#if defined(_WIN32) || defined(WIN32)
printf("PCRE: -S not supported on this OS\n");
exit(1);
#else
@@ -749,7 +839,8 @@ while (argc > 1 && argv[op][0] == '-')
printf(" %sUnicode properties support\n", rc? "" : "No ");
(void)pcre_config(PCRE_CONFIG_NEWLINE, &rc);
printf(" Newline sequence is %s\n", (rc == '\r')? "CR" :
(rc == '\n')? "LF" : "CRLF");
(rc == '\n')? "LF" : (rc == ('\r'<<8 | '\n'))? "CRLF" :
(rc == -1)? "ANY" : "???");
(void)pcre_config(PCRE_CONFIG_LINK_SIZE, &rc);
printf(" Internal link size = %d\n", rc);
(void)pcre_config(PCRE_CONFIG_POSIX_MALLOC_THRESHOLD, &rc);
@@ -762,24 +853,16 @@ while (argc > 1 && argv[op][0] == '-')
printf(" Match recursion uses %s\n", rc? "stack" : "heap");
exit(0);
}
else if (strcmp(argv[op], "-help") == 0 ||
strcmp(argv[op], "--help") == 0)
{
usage();
goto EXIT;
}
else
{
printf("** Unknown or malformed option %s\n", argv[op]);
printf("Usage: pcretest [options] [<input> [<output>]]\n");
printf(" -C show PCRE compile-time options and exit\n");
printf(" -d debug: show compiled code; implies -i\n");
#if !defined NODFA
printf(" -dfa force DFA matching for all subjects\n");
#endif
printf(" -i show information about compiled pattern\n"
" -m output memory used information\n"
" -o <n> set size of offsets vector to <n>\n");
#if !defined NOPOSIX
printf(" -p use POSIX interface\n");
#endif
printf(" -S <n> set stack size to <n> megabytes\n");
printf(" -s output store (memory) used information\n"
" -t time compilation and execution\n");
usage();
yield = 1;
goto EXIT;
}
@@ -803,7 +886,7 @@ if (offsets == NULL)
if (argc > 1)
{
infile = fopen(argv[op], "rb");
infile = fopen(argv[op], INPUT_MODE);
if (infile == NULL)
{
printf("** Failed to open %s\n", argv[op]);
@@ -814,7 +897,7 @@ if (argc > 1)
if (argc > 2)
{
outfile = fopen(argv[op+1], "wb");
outfile = fopen(argv[op+1], OUTPUT_MODE);
if (outfile == NULL)
{
printf("** Failed to open %s\n", argv[op+1]);
@@ -859,7 +942,7 @@ while (!done)
int do_showinfo = showinfo;
int do_showrest = 0;
int do_flip = 0;
int erroroffset, len, delimiter;
int erroroffset, len, delimiter, poffset;
use_utf8 = 0;
@@ -969,6 +1052,7 @@ while (!done)
}
pp = p;
poffset = p - buffer;
for(;;)
{
@@ -989,6 +1073,11 @@ while (!done)
if (infile != stdin) fprintf(outfile, "%s", (char *)pp);
}
/* The buffer may have moved while being extended; reset the start of data
pointer to the correct relative point in the buffer. */
p = buffer + poffset;
/* If the first character after the delimiter is backslash, make
the pattern end with backslash. This is purely to provide a way
of testing for the error message when a pattern ends with backslash. */
@@ -1020,6 +1109,7 @@ while (!done)
case '+': do_showrest = 1; break;
case 'A': options |= PCRE_ANCHORED; break;
case 'B': do_debug = 1; break;
case 'C': options |= PCRE_AUTO_CALLOUT; break;
case 'D': do_debug = do_showinfo = 1; break;
case 'E': options |= PCRE_DOLLAR_ENDONLY; break;
@@ -1042,14 +1132,16 @@ while (!done)
case 'L':
ppp = pp;
/* The '\r' test here is so that it works on Windows */
while (*ppp != '\n' && *ppp != '\r' && *ppp != ' ') ppp++;
/* The '\r' test here is so that it works on Windows. */
/* The test for a zero byte is just in case this is an unterminated line. */
while (*ppp != 0 && *ppp != '\n' && *ppp != '\r' && *ppp != ' ') ppp++;
*ppp = 0;
if (setlocale(LC_CTYPE, (const char *)pp) == NULL)
{
fprintf(outfile, "** Failed to set locale \"%s\"\n", pp);
goto SKIP_DATA;
}
locale_set = 1;
tables = pcre_maketables();
pp = ppp;
break;
@@ -1116,19 +1208,19 @@ while (!done)
#endif /* !defined NOPOSIX */
{
if (timeit)
if (timeit > 0)
{
register int i;
clock_t time_taken;
clock_t start_time = clock();
for (i = 0; i < LOOPREPEAT; i++)
for (i = 0; i < timeit; i++)
{
re = pcre_compile((char *)p, options, &error, &erroroffset, tables);
if (re != NULL) free(re);
}
time_taken = clock() - start_time;
fprintf(outfile, "Compile time %.3f milliseconds\n",
(((double)time_taken * 1000.0) / (double)LOOPREPEAT) /
fprintf(outfile, "Compile time %.4f milliseconds\n",
(((double)time_taken * 1000.0) / (double)timeit) /
(double)CLOCKS_PER_SEC);
}
@@ -1180,17 +1272,17 @@ while (!done)
if (do_study)
{
if (timeit)
if (timeit > 0)
{
register int i;
clock_t time_taken;
clock_t start_time = clock();
for (i = 0; i < LOOPREPEAT; i++)
for (i = 0; i < timeit; i++)
extra = pcre_study(re, study_options, &error);
time_taken = clock() - start_time;
if (extra != NULL) free(extra);
fprintf(outfile, " Study time %.3f milliseconds\n",
(((double)time_taken * 1000.0) / (double)LOOPREPEAT) /
fprintf(outfile, " Study time %.4f milliseconds\n",
(((double)time_taken * 1000.0) / (double)timeit) /
(double)CLOCKS_PER_SEC);
}
extra = pcre_study(re, study_options, &error);
@@ -1233,6 +1325,12 @@ while (!done)
SHOW_INFO:
if (do_debug)
{
fprintf(outfile, "------------------------------------------------------------------\n");
pcre_printint(re, outfile);
}
if (do_showinfo)
{
unsigned long int get_options, all_options;
@@ -1243,12 +1341,6 @@ while (!done)
int nameentrysize, namecount;
const uschar *nametable;
if (do_debug)
{
fprintf(outfile, "------------------------------------------------------------------\n");
pcre_printint(re, outfile);
}
new_info(re, NULL, PCRE_INFO_OPTIONS, &get_options);
new_info(re, NULL, PCRE_INFO_SIZE, &size);
new_info(re, NULL, PCRE_INFO_CAPTURECOUNT, &count);
@@ -1327,7 +1419,7 @@ while (!done)
((get_options & PCRE_NO_UTF8_CHECK) != 0)? " no_utf8_check" : "",
((get_options & PCRE_DUPNAMES) != 0)? " dupnames" : "");
switch (get_options & PCRE_NEWLINE_CRLF)
switch (get_options & PCRE_NEWLINE_BITS)
{
case PCRE_NEWLINE_CR:
fprintf(outfile, "Forced newline sequence: CR\n");
@@ -1341,6 +1433,10 @@ while (!done)
fprintf(outfile, "Forced newline sequence: CRLF\n");
break;
case PCRE_NEWLINE_ANY:
fprintf(outfile, "Forced newline sequence: ANY\n");
break;
default:
break;
}
@@ -1358,7 +1454,7 @@ while (!done)
int ch = first_char & 255;
const char *caseless = ((first_char & REQ_CASELESS) == 0)?
"" : " (caseless)";
if (isprint(ch))
if (PRINTHEX(ch))
fprintf(outfile, "First char = \'%c\'%s\n", ch, caseless);
else
fprintf(outfile, "First char = %d%s\n", ch, caseless);
@@ -1373,7 +1469,7 @@ while (!done)
int ch = need_char & 255;
const char *caseless = ((need_char & REQ_CASELESS) == 0)?
"" : " (caseless)";
if (isprint(ch))
if (PRINTHEX(ch))
fprintf(outfile, "Need char = \'%c\'%s\n", ch, caseless);
else
fprintf(outfile, "Need char = %d%s\n", ch, caseless);
@@ -1409,7 +1505,7 @@ while (!done)
fprintf(outfile, "\n ");
c = 2;
}
if (isprint(i) && i != ' ')
if (PRINTHEX(i) && i != ' ')
{
fprintf(outfile, "%c ", i);
c += 2;
@@ -1468,6 +1564,7 @@ while (!done)
strerror(errno));
}
else fprintf(outfile, "Study data written to %s\n", to_file);
}
}
fclose(f);
@@ -1866,7 +1963,7 @@ while (!done)
for (;; gmatched++) /* Loop for /g or /G */
{
if (timeit)
if (timeitm > 0)
{
register int i;
clock_t time_taken;
@@ -1876,7 +1973,7 @@ while (!done)
if (all_use_dfa || use_dfa)
{
int workspace[1000];
for (i = 0; i < LOOPREPEAT; i++)
for (i = 0; i < timeitm; i++)
count = pcre_dfa_exec(re, NULL, (char *)bptr, len, start_offset,
options | g_notempty, use_offsets, use_size_offsets, workspace,
sizeof(workspace)/sizeof(int));
@@ -1884,13 +1981,13 @@ while (!done)
else
#endif
for (i = 0; i < LOOPREPEAT; i++)
for (i = 0; i < timeitm; i++)
count = pcre_exec(re, extra, (char *)bptr, len,
start_offset, options | g_notempty, use_offsets, use_size_offsets);
time_taken = clock() - start_time;
fprintf(outfile, "Execute time %.3f milliseconds\n",
(((double)time_taken * 1000.0) / (double)LOOPREPEAT) /
fprintf(outfile, "Execute time %.4f milliseconds\n",
(((double)time_taken * 1000.0) / (double)timeitm) /
(double)CLOCKS_PER_SEC);
}
@@ -1966,7 +2063,28 @@ while (!done)
if (count >= 0)
{
int i;
int i, maxcount;
#if !defined NODFA
if (all_use_dfa || use_dfa) maxcount = use_size_offsets/2; else
#endif
maxcount = use_size_offsets/3;
/* This is a check against a lunatic return value. */
if (count > maxcount)
{
fprintf(outfile,
"** PCRE error: returned count %d is too big for offset size %d\n",
count, use_size_offsets);
count = use_size_offsets/3;
if (do_g || do_G)
{
fprintf(outfile, "** /%c loop abandoned\n", do_g? 'g' : 'G');
do_g = do_G = FALSE; /* Break g/G loop */
}
}
for (i = 0; i < count * 2; i += 2)
{
if (use_offsets[i] < 0)
@@ -2165,6 +2283,7 @@ while (!done)
{
new_free((void *)tables);
setlocale(LC_CTYPE, "C");
locale_set = 0;
}
}

View File

@@ -593,7 +593,8 @@ aaaaa2
ffffffffff
This is a line before the binary zero.
This line contains a binary zero here >This is a line after the binary zero.
This line contains a binary zero here >< for testing.
This is a line after the binary zero.
ABOVE the elephant
ABOVE

12
ext/pcre/pcrelib/testdata/grepinput8 vendored Normal file
View File

@@ -0,0 +1,12 @@
X one
X two X three X four
X five
X six
X seven…X eightX nineX ten
Before 111
Before 222Before 333…Match
After 111
After 222After 333
And so on and so on
And so on and so on

View File

@@ -75,7 +75,14 @@ RC=1
39:nineteen
40:twenty
41:
42:This is the last line of this file.
42:Here follows some CR/LF/CRLF test data.
43:
44:abc
def
45:ghi
46:jkl
47:
48:This is the last line of this file.
---------------------------- Test 12 -----------------------------
Pattern
---------------------------- Test 13 -----------------------------
@@ -157,7 +164,8 @@ eighteen
nineteen
twenty
This is the last line of this file.
Here follows some CR/LF/CRLF test data.
---------------------------- Test 25 -----------------------------
15-
16-complete pair
@@ -207,7 +215,8 @@ eighteen
nineteen
twenty
This is the last line of this file.
Here follows some CR/LF/CRLF test data.
---------------------------- Test 27 -----------------------------
four
five
@@ -227,7 +236,10 @@ eighteen
nineteen
twenty
This is the last line of this file.
Here follows some CR/LF/CRLF test data.
abc
def
---------------------------- Test 28 -----------------------------
14-of lines all by themselves.
15-
@@ -279,7 +291,12 @@ eighteen
nineteen
twenty
This is the last line of this file.
Here follows some CR/LF/CRLF test data.
abc
def
ghi
jkl
---------------------------- Test 30 -----------------------------
./testdata/grepinput-4-features should be added at the end, because some of the tests involve the
./testdata/grepinput-5-output of line numbers, and we don't want these to change.
@@ -329,6 +346,7 @@ RC=2
RC=0
---------------------------- Test 36 -----------------------------
./testdata/grepinputx
./testdata/grepinput8
RC=0
---------------------------- Test 37 -----------------------------
aaaaa0
@@ -342,10 +360,13 @@ pcregrep: check your regex for nested unlimited loops
pcregrep: pcre_exec() error -8 while matching this line:
aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa
---------------------------- Test 38 ------------------------------
This line contains a binary zero here >---------------------------- Test 39 ------------------------------
This line contains a binary zero here >< for testing.
---------------------------- Test 39 ------------------------------
This is a line before the binary zero.
This line contains a binary zero here >---------------------------- Test 40 ------------------------------
This line contains a binary zero here >This is a line after the binary zero.
This line contains a binary zero here >< for testing.
---------------------------- Test 40 ------------------------------
This line contains a binary zero here >< for testing.
This is a line after the binary zero.
---------------------------- Test 41 ------------------------------
before the binary zero
after the binary zero
@@ -378,3 +399,31 @@ ABOVE the elephant
AB.VE
AB.VE the turtle
PUT NEW DATA ABOVE THIS LINE.
---------------------------- Test 49 ------------------------------
abc
def
ghi
jkl
---------------------------- Test 50 ------------------------------
def
---------------------------- Test 51 ------------------------------
ghi
jkl
This is the last line of this file.
---------------------------- Test 52 ------------------------------
def
ghi
jkl
This is the last line of this file.
---------------------------- Test 53 ------------------------------
ghi
jkl
This is the last line of this file.
---------------------------- Test 54 ------------------------------
44:abc
45:def
46:ghi
47:jkl

11
ext/pcre/pcrelib/testdata/grepoutput8 vendored Normal file
View File

@@ -0,0 +1,11 @@
---------------------------- Test U1 ------------------------------
1:X one
2:X two 3:X three 4:X four
5:X five
6:X six
7:X seven…8:X eight9:X nine10:X ten
---------------------------- Test U2 ------------------------------
12-Before 111
13-Before 22214-Before 333…15:Match
16-After 111
17-After 22218-After 333

View File

@@ -1297,8 +1297,7 @@
abc
/^a b
c/x
c/x
abc
/^(a|)\1*b/
@@ -1454,11 +1453,6 @@
/{4,5a}bc/
{4,5a}bc
/^a.b/
a\rb
*** Failers
a\nb
/abc$/
abc
abc\n
@@ -1500,8 +1494,8 @@
/(a)(b)(c)(d)(e)(f)(g)(h)(i)(j)(k)\12\123/
abcdefghijk\12S
/ab\gdef/
abgdef
/ab\hdef/
abhdef
/a{0}bc/
bc
@@ -3382,9 +3376,14 @@
cdaccb
/^(?:a?b?)*$/
\
a
ab
aaa
*** Failers
dbcb
a--
aa--
/((?s)^a(.))((?m)^b$)/
a\nb\nc\n
@@ -3884,4 +3883,139 @@
a,b]
[a,b,c]
/(?-x: )/x
A\x20B
"(?x)(?-x: \s*#\s*)"
A # B
** Failers
#
"(?x-is)(?:(?-ixs) \s*#\s*) include"
A #include
** Failers
A#include
A #Include
/a*b*\w/
aaabbbb
aaaa
a
/a*b?\w/
aaabbbb
aaaa
a
/a*b{0,4}\w/
aaabbbb
aaaa
a
/a*b{0,}\w/
aaabbbb
aaaa
a
/a*\d*\w/
0a
a
/a*b *\w/x
a
/a*b#comment
*\w/x
a
/a* b *\w/x
a
/^\w+=.*(\\\n.*)*/
abc=xyz\\\npqr
/(?=(\w+))\1:/
abcd:
/^(?=(\w+))\1:/
abcd:
/^\Eabc/
abc
/^[\Eabc]/
a
** Failers
E
/^[a-\Ec]/
b
** Failers
-
E
/^[a\E\E-\Ec]/
b
** Failers
-
E
/^[\E\Qa\E-\Qz\E]+/
b
** Failers
-
/^[a\Q]bc\E]/
a
]
c
/^[a-\Q\E]/
a
-
/^(a()*)*/
aaaa
/^(?:a(?:(?:))*)*/
aaaa
/^(a()+)+/
aaaa
/^(?:a(?:(?:))+)+/
aaaa
/(a){0,3}(?(1)b|(c|))*D/
abbD
ccccD
D
/(a|)*\d/
aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa
aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa4
/(?>a|)*\d/
aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa
aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa4
/(?:a|)*\d/
aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa
aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa4
/\Z/g
abc\n
/^(?s)(?>.*)(?<!\n)/
abc
abc\n
/^(?![^\n]*\n\z)/
abc
abc\n
/\z(?<!\n)/
abc
abc\n
/ End of testinput1 /

File diff suppressed because it is too large Load Diff

View File

@@ -520,4 +520,7 @@
abcdefg
ab
/a*\x{100}*\w/8
a
/ End of testinput4 /

View File

@@ -270,5 +270,89 @@
/\777/8I
\x{1ff}
\777
/\x{100}*\d/8D
/\x{100}*\s/8D
/\x{100}*\w/8D
/\x{100}*\D/8D
/\x{100}*\S/8D
/\x{100}*\W/8D
/\x{100}+\x{200}/8D
/\x{100}+X/8D
/X+\x{200}/8D
/()()()()()()()()()()
()()()()()()()()()()
()()()()()()()()()()
()()()()()()()()()()
A (x) (?41) B/8x
AxxB
/^[\x{100}\E-\Q\E\x{150}]/B8
/^[\QĀ\E-\QŐ\E]/B8
/^[\QĀ\E-\QŐ\E/B8
/^abc./mgx8<any>
abc1 \x0aabc2 \x0babc3xx \x0cabc4 \x0dabc5xx \x0d\x0aabc6 \x{0085}abc7 \x{2028}abc8 \x{2029}abc9 JUNK
/abc.$/mgx8<any>
abc1\x0a abc2\x0b abc3\x0c abc4\x0d abc5\x0d\x0a abc6\x{0085} abc7\x{2028} abc8\x{2029} abc9
/^a\Rb/8
a\nb
a\rb
a\r\nb
a\x0bb
a\x0cb
a\x{85}b
a\x{2028}b
a\x{2029}b
** Failers
a\n\rb
/^a\R*b/8
ab
a\nb
a\rb
a\r\nb
a\x0bb
a\x0c\x{2028}\x{2029}b
a\x{85}b
a\n\rb
a\n\r\x{85}\x0cb
/^a\R+b/8
a\nb
a\rb
a\r\nb
a\x0bb
a\x0c\x{2028}\x{2029}b
a\x{85}b
a\n\rb
a\n\r\x{85}\x0cb
** Failers
ab
/^a\R{1,3}b/8
a\nb
a\n\rb
a\n\r\x{85}b
a\r\n\r\nb
a\r\n\r\n\r\nb
a\n\r\n\rb
a\n\n\r\nb
** Failers
a\n\n\n\rb
a\r
/ End of testinput5 /

View File

@@ -747,4 +747,19 @@
/([\pL]=(abc))*X/
L=abcX
/The next two should be Perl-compatible, but it fails to match \x{e0}. PCRE
will match it only with UCP support, because without that it has no notion
of case for anything other than the ASCII letters. /
/((?i)[\x{c0}])/8
\x{c0}
\x{e0}
/(?i:[\x{c0}])/8
\x{c0}
\x{e0}
/^\p{Balinese}\p{Cuneiform}\p{Nko}\p{Phags_Pa}\p{Phoenician}/8
\x{1b00}\x{12000}\x{7c0}\x{a840}\x{10900}
/ End of testinput6 /

View File

@@ -1775,8 +1775,7 @@
abc
/^a b
c/x
c/x
abc
/ab{1,3}bc/
@@ -1889,7 +1888,7 @@
/{4,5a}bc/
{4,5a}bc
/^a.b/
/^a.b/<lf>
a\rb
*** Failers
a\nb
@@ -1932,8 +1931,8 @@
/(a)(b)(c)(d)(e)(f)(g)(h)(i)(j)(k)\12\123/
abcdefghijk\12S
/ab\gdef/
abgdef
/ab\hdef/
abhdef
/a{0}bc/
bc
@@ -4067,7 +4066,7 @@
xyz\rabc\<crlf>
xyz\rabc\<lf>
/abc$/m
/abc$/m<lf>
xyzabc
xyzabc\n
xyzabc\npqr
@@ -4099,7 +4098,7 @@
** Failers
xyz\rabcdef
/.*/
/.*/<lf>
abc\ndef
abc\rdef
abc\r\ndef
@@ -4115,4 +4114,119 @@
abc\rdef
abc\r\ndef
/^\w+=.*(\\\n.*)*/
abc=xyz\\\npqr
/^(a()*)*/
aaaa
/^(?:a(?:(?:))*)*/
aaaa
/^(a()+)+/
aaaa
/^(?:a(?:(?:))+)+/
aaaa
/(a|)*\d/
aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa
aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa4
/(?>a|)*\d/
aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa
aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa4
/(?:a|)*\d/
aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa
aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa4
/^a.b/<lf>
a\rb
a\nb\<cr>
** Failers
a\nb
a\nb\<any>
a\rb\<cr>
a\rb\<any>
/^abc./mgx<any>
abc1 \x0aabc2 \x0babc3xx \x0cabc4 \x0dabc5xx \x0d\x0aabc6 \x85abc7 \x{2028}abc8 \x{2029}abc9 JUNK
/abc.$/mgx<any>
abc1\x0a abc2\x0b abc3\x0c abc4\x0d abc5\x0d\x0a abc6\x85 abc7\x{2028} abc8\x{2029} abc9
/^a\Rb/
a\nb
a\rb
a\r\nb
a\x0bb
a\x0cb
a\x85b
** Failers
a\n\rb
/^a\R*b/
ab
a\nb
a\rb
a\r\nb
a\x0bb
a\x0cb
a\x85b
a\n\rb
a\n\r\x85\x0cb
/^a\R+b/
a\nb
a\rb
a\r\nb
a\x0bb
a\x0cb
a\x85b
a\n\rb
a\n\r\x85\x0cb
** Failers
ab
/^a\R{1,3}b/
a\nb
a\n\rb
a\n\r\x85b
a\r\n\r\nb
a\r\n\r\n\r\nb
a\n\r\n\rb
a\n\n\r\nb
** Failers
a\n\n\n\rb
a\r
/^a[\R]b/
aRb
** Failers
a\nb
/.+foo/
afoo
** Failers
\r\nfoo
\nfoo
/.+foo/<crlf>
afoo
\nfoo
** Failers
\r\nfoo
/.+foo/<any>
afoo
** Failers
\nfoo
\r\nfoo
/.+foo/s
afoo
\r\nfoo
\nfoo
/ End of testinput7 /

View File

@@ -537,4 +537,57 @@
/^\x{85}$/8i
\x{85}
/^abc./mgx8<any>
abc1 \x0aabc2 \x0babc3xx \x0cabc4 \x0dabc5xx \x0d\x0aabc6 \x{0085}abc7 \x{2028}abc8 \x{2029}abc9 JUNK
/abc.$/mgx8<any>
abc1\x0a abc2\x0b abc3\x0c abc4\x0d abc5\x0d\x0a abc6\x{0085} abc7\x{2028} abc8\x{2029} abc9
/^a\Rb/8
a\nb
a\rb
a\r\nb
a\x0bb
a\x0cb
a\x{85}b
a\x{2028}b
a\x{2029}b
** Failers
a\n\rb
/^a\R*b/8
ab
a\nb
a\rb
a\r\nb
a\x0bb
a\x0c\x{2028}\x{2029}b
a\x{85}b
a\n\rb
a\n\r\x{85}\x0cb
/^a\R+b/8
a\nb
a\rb
a\r\nb
a\x0bb
a\x0c\x{2028}\x{2029}b
a\x{85}b
a\n\rb
a\n\r\x{85}\x0cb
** Failers
ab
/^a\R{1,3}b/8
a\nb
a\n\rb
a\n\r\x{85}b
a\r\n\r\nb
a\r\n\r\n\r\nb
a\n\r\n\rb
a\n\n\r\nb
** Failers
a\n\n\n\rb
a\r
/ End of testinput 8 /

View File

@@ -1817,8 +1817,7 @@ No match
No match
/^a b
c/x
c/x
abc
0: abc
@@ -2094,14 +2093,6 @@ No match
{4,5a}bc
0: {4,5a}bc
/^a.b/
a\rb
0: a\x0db
*** Failers
No match
a\nb
No match
/abc$/
abc
0: abc
@@ -2198,9 +2189,9 @@ No match
10: j
11: k
/ab\gdef/
abgdef
0: abgdef
/ab\hdef/
abhdef
0: abhdef
/a{0}bc/
bc
@@ -5481,12 +5472,22 @@ No match
0: b
/^(?:a?b?)*$/
\
0:
a
0: a
ab
0: ab
aaa
0: aaa
*** Failers
No match
dbcb
No match
a--
No match
aa--
No match
/((?s)^a(.))((?m)^b$)/
a\nb\nc\n
@@ -6354,4 +6355,220 @@ No match
[a,b,c]
0: [a,b,c]
/(?-x: )/x
A\x20B
0:
"(?x)(?-x: \s*#\s*)"
A # B
0: #
** Failers
No match
#
No match
"(?x-is)(?:(?-ixs) \s*#\s*) include"
A #include
0: #include
** Failers
No match
A#include
No match
A #Include
No match
/a*b*\w/
aaabbbb
0: aaabbbb
aaaa
0: aaaa
a
0: a
/a*b?\w/
aaabbbb
0: aaabb
aaaa
0: aaaa
a
0: a
/a*b{0,4}\w/
aaabbbb
0: aaabbbb
aaaa
0: aaaa
a
0: a
/a*b{0,}\w/
aaabbbb
0: aaabbbb
aaaa
0: aaaa
a
0: a
/a*\d*\w/
0a
0: 0a
a
0: a
/a*b *\w/x
a
0: a
/a*b#comment
*\w/x
a
0: a
/a* b *\w/x
a
0: a
/^\w+=.*(\\\n.*)*/
abc=xyz\\\npqr
0: abc=xyz\
/(?=(\w+))\1:/
abcd:
0: abcd:
1: abcd
/^(?=(\w+))\1:/
abcd:
0: abcd:
1: abcd
/^\Eabc/
abc
0: abc
/^[\Eabc]/
a
0: a
** Failers
No match
E
No match
/^[a-\Ec]/
b
0: b
** Failers
No match
-
No match
E
No match
/^[a\E\E-\Ec]/
b
0: b
** Failers
No match
-
No match
E
No match
/^[\E\Qa\E-\Qz\E]+/
b
0: b
** Failers
No match
-
No match
/^[a\Q]bc\E]/
a
0: a
]
0: ]
c
0: c
/^[a-\Q\E]/
a
0: a
-
0: -
/^(a()*)*/
aaaa
0: aaaa
1: a
2:
/^(?:a(?:(?:))*)*/
aaaa
0: aaaa
/^(a()+)+/
aaaa
0: aaaa
1: a
2:
/^(?:a(?:(?:))+)+/
aaaa
0: aaaa
/(a){0,3}(?(1)b|(c|))*D/
abbD
0: abbD
1: a
ccccD
0: ccccD
1: <unset>
2:
D
0: D
1: <unset>
2:
/(a|)*\d/
aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa
No match
aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa4
0: aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa4
1:
/(?>a|)*\d/
aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa
No match
aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa4
0: aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa4
/(?:a|)*\d/
aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa
No match
aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa4
0: aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa4
/\Z/g
abc\n
0:
0:
/^(?s)(?>.*)(?<!\n)/
abc
0: abc
abc\n
No match
/^(?![^\n]*\n\z)/
abc
0:
abc\n
No match
/\z(?<!\n)/
abc
0:
abc\n
No match
/ End of testinput1 /

File diff suppressed because it is too large Load Diff

View File

@@ -149,9 +149,9 @@ No match
/[[:alpha:]][[:lower:]][[:upper:]]/DLfr_FR
------------------------------------------------------------------
0 102 Bra 0
3 [A-Za-zªµºÀ-ÖØ-öø-ÿ]
36 [a-zµß-öø-ÿ]
69 [A-ZÀ-ÖØ-Þ]
3 [A-Za-z\xaa\xb5\xba\xc0-\xd6\xd8-\xf6\xf8-\xff]
36 [a-z\xb5\xdf-\xf6\xf8-\xff]
69 [A-Z\xc0-\xd6\xd8-\xde]
102 102 Ket
105 End
------------------------------------------------------------------

View File

@@ -914,4 +914,8 @@ No match
0: ab
1: ab
/a*\x{100}*\w/8
a
0: a
/ End of testinput4 /

View File

@@ -103,7 +103,7 @@ First char = 195
Need char = 191
/[\x{100}]/8DM
Memory allocation (code space): 47
Memory allocation (code space): 15
------------------------------------------------------------------
0 11 Bra 0
3 [\x{100}]
@@ -429,11 +429,11 @@ No match
/Ā{3,4}/8SD
------------------------------------------------------------------
0 13 Bra 0
0 11 Bra 0
3 \x{100}{3}
8 \x{100}{,1}
13 13 Ket
16 End
8 \x{100}?
11 11 Ket
14 End
------------------------------------------------------------------
Capturing subpattern count = 0
Partial matching not supported
@@ -445,29 +445,10 @@ Study returned NULL
0: \x{100}\x{100}\x{100}
/(\x{100}+|x)/8SD
------------------------------------------------------------------
0 17 Bra 0
3 6 Bra 1
6 \x{100}+
9 5 Alt
12 x
14 11 Ket
17 17 Ket
20 End
------------------------------------------------------------------
Capturing subpattern count = 1
Partial matching not supported
Options: utf8
No first char
No need char
Starting byte set: x \xc4
/(\x{100}*a|x)/8SD
------------------------------------------------------------------
0 19 Bra 0
3 8 Bra 1
6 \x{100}*
9 a
8 \x{100}+
11 5 Alt
14 x
16 13 Ket
@@ -479,13 +460,13 @@ Partial matching not supported
Options: utf8
No first char
No need char
Starting byte set: a x \xc4
Starting byte set: x \xc4
/(\x{100}{0,2}a|x)/8SD
/(\x{100}*a|x)/8SD
------------------------------------------------------------------
0 21 Bra 0
3 10 Bra 1
6 \x{100}{,2}
8 \x{100}*+
11 a
13 5 Alt
16 x
@@ -500,18 +481,37 @@ No first char
No need char
Starting byte set: a x \xc4
/(\x{100}{0,2}a|x)/8SD
------------------------------------------------------------------
0 23 Bra 0
3 12 Bra 1
8 \x{100}{0,2}
13 a
15 5 Alt
18 x
20 17 Ket
23 23 Ket
26 End
------------------------------------------------------------------
Capturing subpattern count = 1
Partial matching not supported
Options: utf8
No first char
No need char
Starting byte set: a x \xc4
/(\x{100}{1,2}a|x)/8SD
------------------------------------------------------------------
0 24 Bra 0
3 13 Bra 1
6 \x{100}
9 \x{100}{,1}
14 a
16 5 Alt
19 x
21 18 Ket
24 24 Ket
27 End
0 26 Bra 0
3 15 Bra 1
8 \x{100}
11 \x{100}{0,1}
16 a
18 5 Alt
21 x
23 20 Ket
26 26 Ket
29 End
------------------------------------------------------------------
Capturing subpattern count = 1
Partial matching not supported
@@ -628,7 +628,7 @@ Need char = 129
/\x{100}*A/8D
------------------------------------------------------------------
0 8 Bra 0
3 \x{100}*
3 \x{100}*+
6 A
8 8 Ket
11 End
@@ -644,7 +644,7 @@ Need char = 'A'
/\x{100}*\d(?R)/8D
------------------------------------------------------------------
0 16 Bra 0
3 \x{100}*
3 \x{100}*+
6 \d
7 6 Once
10 0 Recurse
@@ -683,7 +683,7 @@ No first char
No need char
/[\x{100}]/8DM
Memory allocation (code space): 47
Memory allocation (code space): 15
------------------------------------------------------------------
0 11 Bra 0
3 [\x{100}]
@@ -912,16 +912,16 @@ No match
/\x{100}abc(xyz(?1))/8D
------------------------------------------------------------------
0 33 Bra 0
0 35 Bra 0
3 \x{100}abc
12 18 Bra 1
15 xyz
21 6 Once
24 12 Recurse
27 6 Ket
30 18 Ket
33 33 Ket
36 End
12 20 Bra 1
17 xyz
23 6 Once
26 12 Recurse
29 6 Ket
32 20 Ket
35 35 Ket
38 End
------------------------------------------------------------------
Capturing subpattern count = 1
Options: utf8
@@ -930,17 +930,17 @@ Need char = 'z'
/[^\x{100}]abc(xyz(?1))/8D
------------------------------------------------------------------
0 38 Bra 0
0 40 Bra 0
3 [^\x{100}]
11 abc
17 18 Bra 1
20 xyz
26 6 Once
29 17 Recurse
32 6 Ket
35 18 Ket
38 38 Ket
41 End
17 20 Bra 1
22 xyz
28 6 Once
31 17 Recurse
34 6 Ket
37 20 Ket
40 40 Ket
43 End
------------------------------------------------------------------
Capturing subpattern count = 1
Options: utf8
@@ -949,17 +949,17 @@ Need char = 'z'
/[ab\x{100}]abc(xyz(?1))/8D
------------------------------------------------------------------
0 70 Bra 0
0 72 Bra 0
3 [ab\x{100}]
43 abc
49 18 Bra 1
52 xyz
58 6 Once
61 49 Recurse
64 6 Ket
67 18 Ket
70 70 Ket
73 End
49 20 Bra 1
54 xyz
60 6 Once
63 49 Recurse
66 6 Ket
69 20 Ket
72 72 Ket
75 End
------------------------------------------------------------------
Capturing subpattern count = 1
Options: utf8
@@ -968,20 +968,20 @@ Need char = 'z'
/(\x{100}(b(?2)c))?/D8
------------------------------------------------------------------
0 32 Bra 0
0 36 Bra 0
3 Brazero
4 25 Bra 1
7 \x{100}
10 16 Bra 2
13 b
15 6 Once
18 10 Recurse
21 6 Ket
24 c
26 16 Ket
29 25 Ket
32 32 Ket
35 End
4 29 Bra 1
9 \x{100}
12 18 Bra 2
17 b
19 6 Once
22 12 Recurse
25 6 Ket
28 c
30 18 Ket
33 29 Ket
36 36 Ket
39 End
------------------------------------------------------------------
Capturing subpattern count = 2
Options: utf8
@@ -990,33 +990,33 @@ No need char
/(\x{100}(b(?2)c)){0,2}/D8
------------------------------------------------------------------
0 67 Bra 0
0 75 Bra 0
3 Brazero
4 60 Bra 0
7 25 Bra 1
10 \x{100}
13 16 Bra 2
16 b
18 6 Once
21 13 Recurse
24 6 Ket
27 c
29 16 Ket
32 25 Ket
35 Brazero
36 25 Bra 1
39 \x{100}
42 16 Bra 2
45 b
47 6 Once
50 13 Recurse
53 6 Ket
56 c
58 16 Ket
61 25 Ket
64 60 Ket
67 67 Ket
70 End
4 68 Bra 0
7 29 Bra 1
12 \x{100}
15 18 Bra 2
20 b
22 6 Once
25 15 Recurse
28 6 Ket
31 c
33 18 Ket
36 29 Ket
39 Brazero
40 29 Bra 1
45 \x{100}
48 18 Bra 2
53 b
55 6 Once
58 15 Recurse
61 6 Ket
64 c
66 18 Ket
69 29 Ket
72 68 Ket
75 75 Ket
78 End
------------------------------------------------------------------
Capturing subpattern count = 2
Options: utf8
@@ -1025,20 +1025,20 @@ No need char
/(\x{100}(b(?1)c))?/D8
------------------------------------------------------------------
0 32 Bra 0
0 36 Bra 0
3 Brazero
4 25 Bra 1
7 \x{100}
10 16 Bra 2
13 b
15 6 Once
18 4 Recurse
21 6 Ket
24 c
26 16 Ket
29 25 Ket
32 32 Ket
35 End
4 29 Bra 1
9 \x{100}
12 18 Bra 2
17 b
19 6 Once
22 4 Recurse
25 6 Ket
28 c
30 18 Ket
33 29 Ket
36 36 Ket
39 End
------------------------------------------------------------------
Capturing subpattern count = 2
Options: utf8
@@ -1047,33 +1047,33 @@ No need char
/(\x{100}(b(?1)c)){0,2}/D8
------------------------------------------------------------------
0 67 Bra 0
0 75 Bra 0
3 Brazero
4 60 Bra 0
7 25 Bra 1
10 \x{100}
13 16 Bra 2
16 b
18 6 Once
21 7 Recurse
24 6 Ket
27 c
29 16 Ket
32 25 Ket
35 Brazero
36 25 Bra 1
39 \x{100}
42 16 Bra 2
45 b
47 6 Once
50 7 Recurse
53 6 Ket
56 c
58 16 Ket
61 25 Ket
64 60 Ket
67 67 Ket
70 End
4 68 Bra 0
7 29 Bra 1
12 \x{100}
15 18 Bra 2
20 b
22 6 Once
25 7 Recurse
28 6 Ket
31 c
33 18 Ket
36 29 Ket
39 Brazero
40 29 Bra 1
45 \x{100}
48 18 Bra 2
53 b
55 6 Once
58 7 Recurse
61 6 Ket
64 c
66 18 Ket
69 29 Ket
72 68 Ket
75 75 Ket
78 End
------------------------------------------------------------------
Capturing subpattern count = 2
Options: utf8
@@ -1119,5 +1119,285 @@ Need char = 191
0: \x{1ff}
\777
0: \x{1ff}
/\x{100}*\d/8D
------------------------------------------------------------------
0 7 Bra 0
3 \x{100}*+
6 \d
7 7 Ket
10 End
------------------------------------------------------------------
Capturing subpattern count = 0
Partial matching not supported
Options: utf8
No first char
No need char
/\x{100}*\s/8D
------------------------------------------------------------------
0 7 Bra 0
3 \x{100}*+
6 \s
7 7 Ket
10 End
------------------------------------------------------------------
Capturing subpattern count = 0
Partial matching not supported
Options: utf8
No first char
No need char
/\x{100}*\w/8D
------------------------------------------------------------------
0 7 Bra 0
3 \x{100}*+
6 \w
7 7 Ket
10 End
------------------------------------------------------------------
Capturing subpattern count = 0
Partial matching not supported
Options: utf8
No first char
No need char
/\x{100}*\D/8D
------------------------------------------------------------------
0 7 Bra 0
3 \x{100}*
6 \D
7 7 Ket
10 End
------------------------------------------------------------------
Capturing subpattern count = 0
Partial matching not supported
Options: utf8
No first char
No need char
/\x{100}*\S/8D
------------------------------------------------------------------
0 7 Bra 0
3 \x{100}*
6 \S
7 7 Ket
10 End
------------------------------------------------------------------
Capturing subpattern count = 0
Partial matching not supported
Options: utf8
No first char
No need char
/\x{100}*\W/8D
------------------------------------------------------------------
0 7 Bra 0
3 \x{100}*
6 \W
7 7 Ket
10 End
------------------------------------------------------------------
Capturing subpattern count = 0
Partial matching not supported
Options: utf8
No first char
No need char
/\x{100}+\x{200}/8D
------------------------------------------------------------------
0 9 Bra 0
3 \x{100}++
6 \x{200}
9 9 Ket
12 End
------------------------------------------------------------------
Capturing subpattern count = 0
Partial matching not supported
Options: utf8
First char = 196
Need char = 128
/\x{100}+X/8D
------------------------------------------------------------------
0 8 Bra 0
3 \x{100}++
6 X
8 8 Ket
11 End
------------------------------------------------------------------
Capturing subpattern count = 0
Partial matching not supported
Options: utf8
First char = 196
Need char = 'X'
/X+\x{200}/8D
------------------------------------------------------------------
0 8 Bra 0
3 X++
5 \x{200}
8 8 Ket
11 End
------------------------------------------------------------------
Capturing subpattern count = 0
Partial matching not supported
Options: utf8
First char = 'X'
Need char = 128
/()()()()()()()()()()
()()()()()()()()()()
()()()()()()()()()()
()()()()()()()()()()
A (x) (?41) B/8x
AxxB
Matched, but too many substrings
0: AxxB
1:
2:
3:
4:
5:
6:
7:
8:
9:
10:
11:
12:
13:
14:
/^[\x{100}\E-\Q\E\x{150}]/B8
------------------------------------------------------------------
0 14 Bra 0
3 ^
4 [\x{100}-\x{150}]
14 14 Ket
17 End
------------------------------------------------------------------
/^[\QĀ\E-\QŐ\E]/B8
------------------------------------------------------------------
0 14 Bra 0
3 ^
4 [\x{100}-\x{150}]
14 14 Ket
17 End
------------------------------------------------------------------
/^[\QĀ\E-\QŐ\E/B8
Failed: missing terminating ] for character class at offset 15
/^abc./mgx8<any>
abc1 \x0aabc2 \x0babc3xx \x0cabc4 \x0dabc5xx \x0d\x0aabc6 \x{0085}abc7 \x{2028}abc8 \x{2029}abc9 JUNK
0: abc1
0: abc2
0: abc3
0: abc4
0: abc5
0: abc6
0: abc7
0: abc8
0: abc9
/abc.$/mgx8<any>
abc1\x0a abc2\x0b abc3\x0c abc4\x0d abc5\x0d\x0a abc6\x{0085} abc7\x{2028} abc8\x{2029} abc9
0: abc1
0: abc2
0: abc3
0: abc4
0: abc5
0: abc6
0: abc7
0: abc8
0: abc9
/^a\Rb/8
a\nb
0: a\x{0a}b
a\rb
0: a\x{0d}b
a\r\nb
0: a\x{0d}\x{0a}b
a\x0bb
0: a\x{0b}b
a\x0cb
0: a\x{0c}b
a\x{85}b
0: a\x{85}b
a\x{2028}b
0: a\x{2028}b
a\x{2029}b
0: a\x{2029}b
** Failers
No match
a\n\rb
No match
/^a\R*b/8
ab
0: ab
a\nb
0: a\x{0a}b
a\rb
0: a\x{0d}b
a\r\nb
0: a\x{0d}\x{0a}b
a\x0bb
0: a\x{0b}b
a\x0c\x{2028}\x{2029}b
0: a\x{0c}\x{2028}\x{2029}b
a\x{85}b
0: a\x{85}b
a\n\rb
0: a\x{0a}\x{0d}b
a\n\r\x{85}\x0cb
0: a\x{0a}\x{0d}\x{85}\x{0c}b
/^a\R+b/8
a\nb
0: a\x{0a}b
a\rb
0: a\x{0d}b
a\r\nb
0: a\x{0d}\x{0a}b
a\x0bb
0: a\x{0b}b
a\x0c\x{2028}\x{2029}b
0: a\x{0c}\x{2028}\x{2029}b
a\x{85}b
0: a\x{85}b
a\n\rb
0: a\x{0a}\x{0d}b
a\n\r\x{85}\x0cb
0: a\x{0a}\x{0d}\x{85}\x{0c}b
** Failers
No match
ab
No match
/^a\R{1,3}b/8
a\nb
0: a\x{0a}b
a\n\rb
0: a\x{0a}\x{0d}b
a\n\r\x{85}b
0: a\x{0a}\x{0d}\x{85}b
a\r\n\r\nb
0: a\x{0d}\x{0a}\x{0d}\x{0a}b
a\r\n\r\n\r\nb
0: a\x{0d}\x{0a}\x{0d}\x{0a}\x{0d}\x{0a}b
a\n\r\n\rb
0: a\x{0a}\x{0d}\x{0a}\x{0d}b
a\n\n\r\nb
0: a\x{0a}\x{0a}\x{0d}\x{0a}b
** Failers
No match
a\n\n\n\rb
No match
a\r
No match
/ End of testinput5 /

View File

@@ -609,7 +609,7 @@ No first char
No need char
/[\p{Nd}]/8DM
Memory allocation (code space): 47
Memory allocation (code space): 15
------------------------------------------------------------------
0 11 Bra 0
3 [\p{Nd}]
@@ -1410,4 +1410,26 @@ No match
1: L=abc
2: abc
/The next two should be Perl-compatible, but it fails to match \x{e0}. PCRE
will match it only with UCP support, because without that it has no notion
of case for anything other than the ASCII letters. /
/((?i)[\x{c0}])/8
\x{c0}
0: \x{c0}
1: \x{c0}
\x{e0}
0: \x{e0}
1: \x{e0}
/(?i:[\x{c0}])/8
\x{c0}
0: \x{c0}
\x{e0}
0: \x{e0}
/^\p{Balinese}\p{Cuneiform}\p{Nko}\p{Phags_Pa}\p{Phoenician}/8
\x{1b00}\x{12000}\x{7c0}\x{a840}\x{10900}
0: \x{1b00}\x{12000}\x{7c0}\x{a840}\x{10900}
/ End of testinput6 /

View File

@@ -2735,8 +2735,7 @@ No match
No match
/^a b
c/x
c/x
abc
0: abc
@@ -2974,7 +2973,7 @@ No match
{4,5a}bc
0: {4,5a}bc
/^a.b/
/^a.b/<lf>
a\rb
0: a\x0db
*** Failers
@@ -3040,9 +3039,9 @@ No match
abcdefghijk\12S
0: abcdefghijk\x0aS
/ab\gdef/
abgdef
0: abgdef
/ab\hdef/
abhdef
0: abhdef
/a{0}bc/
bc
@@ -6601,7 +6600,7 @@ No match
xyz\rabc\<lf>
No match
/abc$/m
/abc$/m<lf>
xyzabc
0: abc
xyzabc\n
@@ -6657,7 +6656,7 @@ No match
xyz\rabcdef
No match
/.*/
/.*/<lf>
abc\ndef
0: abc
1: ab
@@ -6729,4 +6728,228 @@ No match
abc\r\ndef
0: abc\x0d\x0adef
/^\w+=.*(\\\n.*)*/
abc=xyz\\\npqr
0: abc=xyz\\x0apqr
1: abc=xyz\\x0apq
2: abc=xyz\\x0ap
3: abc=xyz\\x0a
4: abc=xyz\
5: abc=xyz
6: abc=xy
7: abc=x
8: abc=
/^(a()*)*/
aaaa
0: aaaa
1: aaa
2: aa
3: a
4:
/^(?:a(?:(?:))*)*/
aaaa
0: aaaa
1: aaa
2: aa
3: a
4:
/^(a()+)+/
aaaa
0: aaaa
1: aaa
2: aa
3: a
/^(?:a(?:(?:))+)+/
aaaa
0: aaaa
1: aaa
2: aa
3: a
/(a|)*\d/
aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa
No match
aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa4
0: aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa4
/(?>a|)*\d/
aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa
No match
aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa4
0: aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa4
/(?:a|)*\d/
aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa
No match
aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa4
0: aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa4
/^a.b/<lf>
a\rb
0: a\x0db
a\nb\<cr>
0: a\x0ab
** Failers
No match
a\nb
No match
a\nb\<any>
No match
a\rb\<cr>
No match
a\rb\<any>
No match
/^abc./mgx<any>
abc1 \x0aabc2 \x0babc3xx \x0cabc4 \x0dabc5xx \x0d\x0aabc6 \x85abc7 \x{2028}abc8 \x{2029}abc9 JUNK
0: abc1
0: abc2
0: abc3
0: abc4
0: abc5
0: abc6
0: abc7
/abc.$/mgx<any>
abc1\x0a abc2\x0b abc3\x0c abc4\x0d abc5\x0d\x0a abc6\x85 abc7\x{2028} abc8\x{2029} abc9
0: abc1
0: abc2
0: abc3
0: abc4
0: abc5
0: abc6
0: abc9
/^a\Rb/
a\nb
0: a\x0ab
a\rb
0: a\x0db
a\r\nb
0: a\x0d\x0ab
a\x0bb
0: a\x0bb
a\x0cb
0: a\x0cb
a\x85b
0: a\x85b
** Failers
No match
a\n\rb
No match
/^a\R*b/
ab
0: ab
a\nb
0: a\x0ab
a\rb
0: a\x0db
a\r\nb
0: a\x0d\x0ab
a\x0bb
0: a\x0bb
a\x0cb
0: a\x0cb
a\x85b
0: a\x85b
a\n\rb
0: a\x0a\x0db
a\n\r\x85\x0cb
0: a\x0a\x0d\x85\x0cb
/^a\R+b/
a\nb
0: a\x0ab
a\rb
0: a\x0db
a\r\nb
0: a\x0d\x0ab
a\x0bb
0: a\x0bb
a\x0cb
0: a\x0cb
a\x85b
0: a\x85b
a\n\rb
0: a\x0a\x0db
a\n\r\x85\x0cb
0: a\x0a\x0d\x85\x0cb
** Failers
No match
ab
No match
/^a\R{1,3}b/
a\nb
0: a\x0ab
a\n\rb
0: a\x0a\x0db
a\n\r\x85b
0: a\x0a\x0d\x85b
a\r\n\r\nb
0: a\x0d\x0a\x0d\x0ab
a\r\n\r\n\r\nb
0: a\x0d\x0a\x0d\x0a\x0d\x0ab
a\n\r\n\rb
0: a\x0a\x0d\x0a\x0db
a\n\n\r\nb
0: a\x0a\x0a\x0d\x0ab
** Failers
No match
a\n\n\n\rb
No match
a\r
No match
/^a[\R]b/
aRb
0: aRb
** Failers
No match
a\nb
No match
/.+foo/
afoo
0: afoo
** Failers
No match
\r\nfoo
No match
\nfoo
No match
/.+foo/<crlf>
afoo
0: afoo
\nfoo
0: \x0afoo
** Failers
No match
\r\nfoo
No match
/.+foo/<any>
afoo
0: afoo
** Failers
No match
\nfoo
No match
\r\nfoo
No match
/.+foo/s
afoo
0: afoo
\r\nfoo
0: \x0d\x0afoo
\nfoo
0: \x0afoo
/ End of testinput7 /

View File

@@ -1028,4 +1028,114 @@ No match
\x{85}
0: \x{85}
/^abc./mgx8<any>
abc1 \x0aabc2 \x0babc3xx \x0cabc4 \x0dabc5xx \x0d\x0aabc6 \x{0085}abc7 \x{2028}abc8 \x{2029}abc9 JUNK
0: abc1
0: abc2
0: abc3
0: abc4
0: abc5
0: abc6
0: abc7
0: abc8
0: abc9
/abc.$/mgx8<any>
abc1\x0a abc2\x0b abc3\x0c abc4\x0d abc5\x0d\x0a abc6\x{0085} abc7\x{2028} abc8\x{2029} abc9
0: abc1
0: abc2
0: abc3
0: abc4
0: abc5
0: abc6
0: abc7
0: abc8
0: abc9
/^a\Rb/8
a\nb
0: a\x{0a}b
a\rb
0: a\x{0d}b
a\r\nb
0: a\x{0d}\x{0a}b
a\x0bb
0: a\x{0b}b
a\x0cb
0: a\x{0c}b
a\x{85}b
0: a\x{85}b
a\x{2028}b
0: a\x{2028}b
a\x{2029}b
0: a\x{2029}b
** Failers
No match
a\n\rb
No match
/^a\R*b/8
ab
0: ab
a\nb
0: a\x{0a}b
a\rb
0: a\x{0d}b
a\r\nb
0: a\x{0d}\x{0a}b
a\x0bb
0: a\x{0b}b
a\x0c\x{2028}\x{2029}b
0: a\x{0c}\x{2028}\x{2029}b
a\x{85}b
0: a\x{85}b
a\n\rb
0: a\x{0a}\x{0d}b
a\n\r\x{85}\x0cb
0: a\x{0a}\x{0d}\x{85}\x{0c}b
/^a\R+b/8
a\nb
0: a\x{0a}b
a\rb
0: a\x{0d}b
a\r\nb
0: a\x{0d}\x{0a}b
a\x0bb
0: a\x{0b}b
a\x0c\x{2028}\x{2029}b
0: a\x{0c}\x{2028}\x{2029}b
a\x{85}b
0: a\x{85}b
a\n\rb
0: a\x{0a}\x{0d}b
a\n\r\x{85}\x0cb
0: a\x{0a}\x{0d}\x{85}\x{0c}b
** Failers
No match
ab
No match
/^a\R{1,3}b/8
a\nb
0: a\x{0a}b
a\n\rb
0: a\x{0a}\x{0d}b
a\n\r\x{85}b
0: a\x{0a}\x{0d}\x{85}b
a\r\n\r\nb
0: a\x{0d}\x{0a}\x{0d}\x{0a}b
a\r\n\r\n\r\nb
0: a\x{0d}\x{0a}\x{0d}\x{0a}\x{0d}\x{0a}b
a\n\r\n\rb
0: a\x{0a}\x{0d}\x{0a}\x{0d}b
a\n\n\r\nb
0: a\x{0a}\x{0a}\x{0d}\x{0a}b
** Failers
No match
a\n\n\n\rb
No match
a\r
No match
/ End of testinput 8 /

View File

@@ -6,7 +6,9 @@
#define _UCP_H
/* This file contains definitions of the property values that are returned by
the function _pcre_ucp_findprop(). */
the function _pcre_ucp_findprop(). New values that are added for new releases
of Unicode should always be at the end of each enum, for backwards
compatibility. */
/* These are the general character categories. */
@@ -118,7 +120,12 @@ enum {
ucp_Tibetan,
ucp_Tifinagh,
ucp_Ugaritic,
ucp_Yi
ucp_Yi,
ucp_Balinese, /* New for Unicode 5.0.0 */
ucp_Cuneiform, /* New for Unicode 5.0.0 */
ucp_Nko, /* New for Unicode 5.0.0 */
ucp_Phags_Pa, /* New for Unicode 5.0.0 */
ucp_Phoenician /* New for Unicode 5.0.0 */
};
#endif

View File

@@ -2,6 +2,9 @@
* Unicode Property Table handler *
*************************************************/
#ifndef _UCPINTERNAL_H
#define _UCPINTERNAL_H
/* Internal header file defining the layout of the bits in each pair of 32-bit
words that form a data item in the table. */
@@ -84,4 +87,6 @@ When searching the data, proceed as follows:
(2).
*/
#endif /* _UCPINTERNAL_H */
/* End of ucpinternal.h */

View File

@@ -1,5 +1,6 @@
/* This source module is automatically generated from the Unicode
property table. See ucpinternal.h for a description of the layout. */
property table. See ucpinternal.h for a description of the layout.
This version was made from the Unicode 5.0.0 tables. */
static cnode ucp_table[] = {
{ 0x09800000, 0x0000001f },
@@ -298,7 +299,7 @@ static cnode ucp_table[] = {
{ 0x2100017d, 0x24000001 },
{ 0x2100017e, 0x1400ffff },
{ 0x2100017f, 0x1400fed4 },
{ 0x21000180, 0x14000000 },
{ 0x21000180, 0x140000c3 },
{ 0x21000181, 0x240000d2 },
{ 0x21000182, 0x24000001 },
{ 0x21000183, 0x1400ffff },
@@ -475,13 +476,27 @@ static cnode ucp_table[] = {
{ 0x21000232, 0x24000001 },
{ 0x21000233, 0x1400ffff },
{ 0x21800234, 0x14000005 },
{ 0x2100023a, 0x24000000 },
{ 0x2100023a, 0x24002a2b },
{ 0x2100023b, 0x24000001 },
{ 0x2100023c, 0x1400ffff },
{ 0x2100023d, 0x2400ff5d },
{ 0x2100023e, 0x24000000 },
{ 0x2100023e, 0x24002a28 },
{ 0x2180023f, 0x14000001 },
{ 0x21000241, 0x24000053 },
{ 0x21000241, 0x24000001 },
{ 0x21000242, 0x1400ffff },
{ 0x21000243, 0x2400ff3d },
{ 0x21000244, 0x24000045 },
{ 0x21000245, 0x24000047 },
{ 0x21000246, 0x24000001 },
{ 0x21000247, 0x1400ffff },
{ 0x21000248, 0x24000001 },
{ 0x21000249, 0x1400ffff },
{ 0x2100024a, 0x24000001 },
{ 0x2100024b, 0x1400ffff },
{ 0x2100024c, 0x24000001 },
{ 0x2100024d, 0x1400ffff },
{ 0x2100024e, 0x24000001 },
{ 0x2100024f, 0x1400ffff },
{ 0x21800250, 0x14000002 },
{ 0x21000253, 0x1400ff2e },
{ 0x21000254, 0x1400ff32 },
@@ -499,25 +514,30 @@ static cnode ucp_table[] = {
{ 0x21800264, 0x14000003 },
{ 0x21000268, 0x1400ff2f },
{ 0x21000269, 0x1400ff2d },
{ 0x2180026a, 0x14000004 },
{ 0x2100026a, 0x14000000 },
{ 0x2100026b, 0x140029f7 },
{ 0x2180026c, 0x14000002 },
{ 0x2100026f, 0x1400ff2d },
{ 0x21800270, 0x14000001 },
{ 0x21000272, 0x1400ff2b },
{ 0x21800273, 0x14000001 },
{ 0x21000275, 0x1400ff2a },
{ 0x21800276, 0x14000009 },
{ 0x21800276, 0x14000006 },
{ 0x2100027d, 0x140029e7 },
{ 0x2180027e, 0x14000001 },
{ 0x21000280, 0x1400ff26 },
{ 0x21800281, 0x14000001 },
{ 0x21000283, 0x1400ff26 },
{ 0x21800284, 0x14000003 },
{ 0x21000288, 0x1400ff26 },
{ 0x21000289, 0x14000000 },
{ 0x21000289, 0x1400ffbb },
{ 0x2100028a, 0x1400ff27 },
{ 0x2100028b, 0x1400ff27 },
{ 0x2180028c, 0x14000005 },
{ 0x2100028c, 0x1400ffb9 },
{ 0x2180028d, 0x14000004 },
{ 0x21000292, 0x1400ff25 },
{ 0x21000293, 0x14000000 },
{ 0x21000294, 0x1400ffad },
{ 0x21000294, 0x1c000000 },
{ 0x21800295, 0x1400001a },
{ 0x218002b0, 0x18000011 },
{ 0x098002c2, 0x60000003 },
@@ -532,6 +552,9 @@ static cnode ucp_table[] = {
{ 0x1b800346, 0x30000029 },
{ 0x13800374, 0x60000001 },
{ 0x1300037a, 0x18000000 },
{ 0x1300037b, 0x14000082 },
{ 0x1300037c, 0x14000082 },
{ 0x1300037d, 0x14000082 },
{ 0x0900037e, 0x54000000 },
{ 0x13800384, 0x60000001 },
{ 0x13000386, 0x24000026 },
@@ -647,7 +670,9 @@ static cnode ucp_table[] = {
{ 0x130003fa, 0x24000001 },
{ 0x130003fb, 0x1400ffff },
{ 0x130003fc, 0x14000000 },
{ 0x138003fd, 0x24000002 },
{ 0x130003fd, 0x2400ff7e },
{ 0x130003fe, 0x2400ff7e },
{ 0x130003ff, 0x2400ff7e },
{ 0x0c000400, 0x24000050 },
{ 0x0c000401, 0x24000050 },
{ 0x0c000402, 0x24000050 },
@@ -835,7 +860,7 @@ static cnode ucp_table[] = {
{ 0x0c0004bd, 0x1400ffff },
{ 0x0c0004be, 0x24000001 },
{ 0x0c0004bf, 0x1400ffff },
{ 0x0c0004c0, 0x24000000 },
{ 0x0c0004c0, 0x2400000f },
{ 0x0c0004c1, 0x24000001 },
{ 0x0c0004c2, 0x1400ffff },
{ 0x0c0004c3, 0x24000001 },
@@ -850,6 +875,7 @@ static cnode ucp_table[] = {
{ 0x0c0004cc, 0x1400ffff },
{ 0x0c0004cd, 0x24000001 },
{ 0x0c0004ce, 0x1400ffff },
{ 0x0c0004cf, 0x1400fff1 },
{ 0x0c0004d0, 0x24000001 },
{ 0x0c0004d1, 0x1400ffff },
{ 0x0c0004d2, 0x24000001 },
@@ -892,6 +918,12 @@ static cnode ucp_table[] = {
{ 0x0c0004f7, 0x1400ffff },
{ 0x0c0004f8, 0x24000001 },
{ 0x0c0004f9, 0x1400ffff },
{ 0x0c0004fa, 0x24000001 },
{ 0x0c0004fb, 0x1400ffff },
{ 0x0c0004fc, 0x24000001 },
{ 0x0c0004fd, 0x1400ffff },
{ 0x0c0004fe, 0x24000001 },
{ 0x0c0004ff, 0x1400ffff },
{ 0x0c000500, 0x24000001 },
{ 0x0c000501, 0x1400ffff },
{ 0x0c000502, 0x24000001 },
@@ -908,6 +940,10 @@ static cnode ucp_table[] = {
{ 0x0c00050d, 0x1400ffff },
{ 0x0c00050e, 0x24000001 },
{ 0x0c00050f, 0x1400ffff },
{ 0x0c000510, 0x24000001 },
{ 0x0c000511, 0x1400ffff },
{ 0x0c000512, 0x24000001 },
{ 0x0c000513, 0x1400ffff },
{ 0x01000531, 0x24000030 },
{ 0x01000532, 0x24000030 },
{ 0x01000533, 0x24000030 },
@@ -989,8 +1025,7 @@ static cnode ucp_table[] = {
{ 0x01000587, 0x14000000 },
{ 0x09000589, 0x54000000 },
{ 0x0100058a, 0x44000000 },
{ 0x19800591, 0x30000028 },
{ 0x198005bb, 0x30000002 },
{ 0x19800591, 0x3000002c },
{ 0x190005be, 0x54000000 },
{ 0x190005bf, 0x30000000 },
{ 0x190005c0, 0x54000000 },
@@ -1043,6 +1078,13 @@ static cnode ucp_table[] = {
{ 0x37800780, 0x1c000025 },
{ 0x378007a6, 0x3000000a },
{ 0x370007b1, 0x1c000000 },
{ 0x3f8007c0, 0x34000009 },
{ 0x3f8007ca, 0x1c000020 },
{ 0x3f8007eb, 0x30000008 },
{ 0x3f8007f4, 0x18000001 },
{ 0x3f0007f6, 0x68000000 },
{ 0x3f8007f7, 0x54000002 },
{ 0x3f0007fa, 0x18000000 },
{ 0x0e800901, 0x30000001 },
{ 0x0e000903, 0x28000000 },
{ 0x0e800904, 0x1c000035 },
@@ -1059,7 +1101,7 @@ static cnode ucp_table[] = {
{ 0x09800964, 0x54000001 },
{ 0x0e800966, 0x34000009 },
{ 0x09000970, 0x54000000 },
{ 0x0e00097d, 0x1c000000 },
{ 0x0e80097b, 0x1c000004 },
{ 0x02000981, 0x30000000 },
{ 0x02800982, 0x28000001 },
{ 0x02800985, 0x1c000007 },
@@ -1203,7 +1245,9 @@ static cnode ucp_table[] = {
{ 0x1c800cd5, 0x28000001 },
{ 0x1c000cde, 0x1c000000 },
{ 0x1c800ce0, 0x1c000001 },
{ 0x1c800ce2, 0x30000001 },
{ 0x1c800ce6, 0x34000009 },
{ 0x1c800cf1, 0x68000001 },
{ 0x24800d02, 0x28000001 },
{ 0x24800d05, 0x1c000007 },
{ 0x24800d0e, 0x1c000002 },
@@ -1452,13 +1496,33 @@ static cnode ucp_table[] = {
{ 0x05801a17, 0x30000001 },
{ 0x05801a19, 0x28000002 },
{ 0x05801a1e, 0x54000001 },
{ 0x3d801b00, 0x30000003 },
{ 0x3d001b04, 0x28000000 },
{ 0x3d801b05, 0x1c00002e },
{ 0x3d001b34, 0x30000000 },
{ 0x3d001b35, 0x28000000 },
{ 0x3d801b36, 0x30000004 },
{ 0x3d001b3b, 0x28000000 },
{ 0x3d001b3c, 0x30000000 },
{ 0x3d801b3d, 0x28000004 },
{ 0x3d001b42, 0x30000000 },
{ 0x3d801b43, 0x28000001 },
{ 0x3d801b45, 0x1c000006 },
{ 0x3d801b50, 0x34000009 },
{ 0x3d801b5a, 0x54000006 },
{ 0x3d801b61, 0x68000009 },
{ 0x3d801b6b, 0x30000008 },
{ 0x3d801b74, 0x68000008 },
{ 0x21801d00, 0x1400002b },
{ 0x21801d2c, 0x18000035 },
{ 0x21801d62, 0x14000015 },
{ 0x0c001d78, 0x18000000 },
{ 0x21801d79, 0x14000021 },
{ 0x21801d79, 0x14000003 },
{ 0x21001d7d, 0x14000ee6 },
{ 0x21801d7e, 0x1400001c },
{ 0x21801d9b, 0x18000024 },
{ 0x1b801dc0, 0x30000003 },
{ 0x1b801dc0, 0x3000000a },
{ 0x1b801dfe, 0x30000001 },
{ 0x21001e00, 0x24000001 },
{ 0x21001e01, 0x1400ffff },
{ 0x21001e02, 0x24000001 },
@@ -1967,7 +2031,7 @@ static cnode ucp_table[] = {
{ 0x1b8020dd, 0x2c000003 },
{ 0x1b0020e1, 0x30000000 },
{ 0x1b8020e2, 0x2c000002 },
{ 0x1b8020e5, 0x30000006 },
{ 0x1b8020e5, 0x3000000a },
{ 0x09802100, 0x68000001 },
{ 0x09002102, 0x24000000 },
{ 0x09802103, 0x68000003 },
@@ -1995,7 +2059,7 @@ static cnode ucp_table[] = {
{ 0x0900212e, 0x68000000 },
{ 0x0900212f, 0x14000000 },
{ 0x09802130, 0x24000001 },
{ 0x09002132, 0x68000000 },
{ 0x21002132, 0x2400001c },
{ 0x09002133, 0x24000000 },
{ 0x09002134, 0x14000000 },
{ 0x09802135, 0x1c000003 },
@@ -2008,7 +2072,8 @@ static cnode ucp_table[] = {
{ 0x09802146, 0x14000003 },
{ 0x0900214a, 0x68000000 },
{ 0x0900214b, 0x64000000 },
{ 0x0900214c, 0x68000000 },
{ 0x0980214c, 0x68000001 },
{ 0x2100214e, 0x1400ffe4 },
{ 0x09802153, 0x3c00000c },
{ 0x09002160, 0x38000010 },
{ 0x09002161, 0x38000010 },
@@ -2042,7 +2107,9 @@ static cnode ucp_table[] = {
{ 0x0900217d, 0x3800fff0 },
{ 0x0900217e, 0x3800fff0 },
{ 0x0900217f, 0x3800fff0 },
{ 0x09802180, 0x38000003 },
{ 0x09802180, 0x38000002 },
{ 0x09002183, 0x24000001 },
{ 0x21002184, 0x1400ffff },
{ 0x09802190, 0x64000004 },
{ 0x09802195, 0x68000004 },
{ 0x0980219a, 0x64000001 },
@@ -2073,10 +2140,9 @@ static cnode ucp_table[] = {
{ 0x0900237c, 0x64000000 },
{ 0x0980237d, 0x6800001d },
{ 0x0980239b, 0x64000018 },
{ 0x090023b4, 0x58000000 },
{ 0x090023b5, 0x48000000 },
{ 0x090023b6, 0x54000000 },
{ 0x098023b7, 0x68000024 },
{ 0x098023b4, 0x68000027 },
{ 0x098023dc, 0x64000005 },
{ 0x098023e2, 0x68000005 },
{ 0x09802400, 0x68000026 },
{ 0x09802440, 0x6800000a },
{ 0x09802460, 0x3c00003b },
@@ -2143,7 +2209,7 @@ static cnode ucp_table[] = {
{ 0x09802600, 0x6800006e },
{ 0x0900266f, 0x64000000 },
{ 0x09802670, 0x6800002c },
{ 0x098026a0, 0x68000011 },
{ 0x098026a0, 0x68000012 },
{ 0x09802701, 0x68000003 },
{ 0x09802706, 0x68000003 },
{ 0x0980270c, 0x6800001b },
@@ -2174,6 +2240,7 @@ static cnode ucp_table[] = {
{ 0x098027c0, 0x64000004 },
{ 0x090027c5, 0x58000000 },
{ 0x090027c6, 0x48000000 },
{ 0x098027c7, 0x64000003 },
{ 0x098027d0, 0x64000015 },
{ 0x090027e6, 0x58000000 },
{ 0x090027e7, 0x48000000 },
@@ -2215,7 +2282,8 @@ static cnode ucp_table[] = {
{ 0x090029fc, 0x58000000 },
{ 0x090029fd, 0x48000000 },
{ 0x098029fe, 0x64000101 },
{ 0x09802b00, 0x68000013 },
{ 0x09802b00, 0x6800001a },
{ 0x09802b20, 0x68000003 },
{ 0x11002c00, 0x24000030 },
{ 0x11002c01, 0x24000030 },
{ 0x11002c02, 0x24000030 },
@@ -2310,6 +2378,23 @@ static cnode ucp_table[] = {
{ 0x11002c5c, 0x1400ffd0 },
{ 0x11002c5d, 0x1400ffd0 },
{ 0x11002c5e, 0x1400ffd0 },
{ 0x21002c60, 0x24000001 },
{ 0x21002c61, 0x1400ffff },
{ 0x21002c62, 0x2400d609 },
{ 0x21002c63, 0x2400f11a },
{ 0x21002c64, 0x2400d619 },
{ 0x21002c65, 0x1400d5d5 },
{ 0x21002c66, 0x1400d5d8 },
{ 0x21002c67, 0x24000001 },
{ 0x21002c68, 0x1400ffff },
{ 0x21002c69, 0x24000001 },
{ 0x21002c6a, 0x1400ffff },
{ 0x21002c6b, 0x24000001 },
{ 0x21002c6c, 0x1400ffff },
{ 0x21002c74, 0x14000000 },
{ 0x21002c75, 0x24000001 },
{ 0x21002c76, 0x1400ffff },
{ 0x21002c77, 0x14000000 },
{ 0x0a002c80, 0x24000001 },
{ 0x0a002c81, 0x1400ffff },
{ 0x0a002c82, 0x24000001 },
@@ -2559,6 +2644,8 @@ static cnode ucp_table[] = {
{ 0x3c80a016, 0x1c000476 },
{ 0x3c80a490, 0x68000036 },
{ 0x0980a700, 0x60000016 },
{ 0x0980a717, 0x18000003 },
{ 0x0980a720, 0x60000001 },
{ 0x3080a800, 0x1c000001 },
{ 0x3000a802, 0x28000000 },
{ 0x3080a803, 0x1c000002 },
@@ -2570,6 +2657,8 @@ static cnode ucp_table[] = {
{ 0x3080a825, 0x30000001 },
{ 0x3000a827, 0x28000000 },
{ 0x3080a828, 0x68000003 },
{ 0x4080a840, 0x1c000033 },
{ 0x4080a874, 0x54000003 },
{ 0x1780ac00, 0x1c002ba3 },
{ 0x0980d800, 0x1000037f },
{ 0x0980db80, 0x1000007f },
@@ -2765,13 +2854,15 @@ static cnode ucp_table[] = {
{ 0x1301018a, 0x3c000000 },
{ 0x29810300, 0x1c00001e },
{ 0x29810320, 0x3c000003 },
{ 0x12810330, 0x1c000019 },
{ 0x12810330, 0x1c000010 },
{ 0x12010341, 0x38000000 },
{ 0x12810342, 0x1c000007 },
{ 0x1201034a, 0x38000000 },
{ 0x3b810380, 0x1c00001d },
{ 0x3b01039f, 0x54000000 },
{ 0x2a8103a0, 0x1c000023 },
{ 0x2a8103c8, 0x1c000007 },
{ 0x2a0103d0, 0x68000000 },
{ 0x2a0103d0, 0x54000000 },
{ 0x2a8103d1, 0x38000004 },
{ 0x0d010400, 0x24000028 },
{ 0x0d010401, 0x24000028 },
@@ -2861,6 +2952,9 @@ static cnode ucp_table[] = {
{ 0x0b810837, 0x1c000001 },
{ 0x0b01083c, 0x1c000000 },
{ 0x0b01083f, 0x1c000000 },
{ 0x41810900, 0x1c000015 },
{ 0x41810916, 0x3c000003 },
{ 0x4101091f, 0x54000000 },
{ 0x1e010a00, 0x1c000000 },
{ 0x1e810a01, 0x30000002 },
{ 0x1e810a05, 0x30000001 },
@@ -2872,6 +2966,9 @@ static cnode ucp_table[] = {
{ 0x1e010a3f, 0x30000000 },
{ 0x1e810a40, 0x3c000007 },
{ 0x1e810a50, 0x54000008 },
{ 0x3e812000, 0x1c00036e },
{ 0x3e812400, 0x38000062 },
{ 0x3e812470, 0x54000003 },
{ 0x0981d000, 0x680000f5 },
{ 0x0981d100, 0x68000026 },
{ 0x0981d12a, 0x6800003a },
@@ -2890,6 +2987,7 @@ static cnode ucp_table[] = {
{ 0x1381d242, 0x30000002 },
{ 0x1301d245, 0x68000000 },
{ 0x0981d300, 0x68000056 },
{ 0x0981d360, 0x3c000011 },
{ 0x0981d400, 0x24000019 },
{ 0x0981d41a, 0x14000019 },
{ 0x0981d434, 0x24000019 },
@@ -2957,6 +3055,8 @@ static cnode ucp_table[] = {
{ 0x0981d7aa, 0x14000018 },
{ 0x0901d7c3, 0x64000000 },
{ 0x0981d7c4, 0x14000005 },
{ 0x0901d7ca, 0x24000000 },
{ 0x0901d7cb, 0x14000000 },
{ 0x0981d7ce, 0x34000031 },
{ 0x16820000, 0x1c00a6d6 },
{ 0x1682f800, 0x1c00021d },