1
0
mirror of https://github.com/php/php-src.git synced 2026-04-25 08:58:28 +02:00

Upgrade PCRE library to 5.0.

This commit is contained in:
Andrei Zmievski
2005-05-27 18:07:33 +00:00
parent 8ce349b8e0
commit 1d019347cd
30 changed files with 20350 additions and 2095 deletions
+1
View File
@@ -1,6 +1,7 @@
PHP NEWS
|||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||
?? ??? 2005, PHP 5.0.5
- Upgraded PCRE library to version 5.0. (Andrei)
- Removed php_check_syntax() function which never worked properly. (Ilia)
- Added new function mysqli_set_charset(). (Georg)
- Added man pages for "phpize" and "php-config" scripts. (Jakub Vrana)
+2 -2
View File
@@ -13,7 +13,7 @@ PHP_ARG_WITH(pcre-regex,for PCRE support,
if test "$PHP_PCRE_REGEX" != "no"; then
if test "$PHP_PCRE_REGEX" = "yes"; then
PHP_NEW_EXTENSION(pcre, pcrelib/maketables.c pcrelib/get.c pcrelib/study.c pcrelib/pcre.c php_pcre.c, $ext_shared,,-DSUPPORT_UTF8 -DLINK_SIZE=2 -DPOSIX_MALLOC_THRESHOLD=10 -I@ext_srcdir@/pcrelib)
PHP_NEW_EXTENSION(pcre, pcrelib/maketables.c pcrelib/get.c pcrelib/study.c pcrelib/pcre.c php_pcre.c, $ext_shared,,-DSUPPORT_UTF8 -DSUPPORT_UCP -DLINK_SIZE=2 -DPOSIX_MALLOC_THRESHOLD=10 -DMATCH_LIMIT=10000000 -I@ext_srcdir@/pcrelib)
PHP_ADD_BUILD_DIR($ext_builddir/pcrelib)
AC_DEFINE(HAVE_BUNDLED_PCRE, 1, [ ])
else
@@ -50,7 +50,7 @@ if test "$PHP_PCRE_REGEX" != "no"; then
AC_DEFINE(HAVE_PCRE, 1, [ ])
PHP_ADD_INCLUDE($PCRE_INCDIR)
PHP_NEW_EXTENSION(pcre, php_pcre.c, $ext_shared,,-DSUPPORT_UTF8 -DLINK_SIZE=2 -DPOSIX_MALLOC_THRESHOLD=10)
PHP_NEW_EXTENSION(pcre, php_pcre.c, $ext_shared,,-DSUPPORT_UTF8 -DSUPPORT_UCP -DLINK_SIZE=2 -DPOSIX_MALLOC_THRESHOLD=10 -DMATCH_LIMIT=10000000)
fi
PHP_SUBST(PCRE_SHARED_LIBADD)
fi
+1 -1
View File
@@ -3,4 +3,4 @@ Written by: Philip Hazel <ph10@cam.ac.uk>
University of Cambridge Computing Service,
Cambridge, England. Phone: +44 1223 334714.
Copyright (c) 1997-2003 University of Cambridge
Copyright (c) 1997-2004 University of Cambridge
+27 -36
View File
@@ -4,51 +4,42 @@ PCRE LICENCE
PCRE is a library of functions to support regular expressions whose syntax
and semantics are as close as possible to those of the Perl 5 language.
Release 5 of PCRE is distributed under the terms of the "BSD" licence, as
specified below. The documentation for PCRE, supplied in the "doc"
directory, is distributed under the same terms as the software itself.
Written by: Philip Hazel <ph10@cam.ac.uk>
University of Cambridge Computing Service,
Cambridge, England. Phone: +44 1223 334714.
Copyright (c) 1997-2003 University of Cambridge
Copyright (c) 1997-2004 University of Cambridge
All rights reserved.
Permission is granted to anyone to use this software for any purpose on any
computer system, and to redistribute it freely, subject to the following
restrictions:
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are met:
1. This software is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
* Redistributions of source code must retain the above copyright notice,
this list of conditions and the following disclaimer.
2. The origin of this software must not be misrepresented, either by
explicit claim or by omission. In practice, this means that if you use
PCRE in software that you distribute to others, commercially or
otherwise, you must put a sentence like this
* Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in the
documentation and/or other materials provided with the distribution.
Regular expression support is provided by the PCRE library package,
which is open source software, written by Philip Hazel, and copyright
by the University of Cambridge, England.
* Neither the name of the University of Cambridge nor the names of its
contributors may be used to endorse or promote products derived from
this software without specific prior written permission.
somewhere reasonably visible in your documentation and in any relevant
files or online help data or similar. A reference to the ftp site for
the source, that is, to
ftp://ftp.csx.cam.ac.uk/pub/software/programming/pcre/
should also be given in the documentation. However, this condition is not
intended to apply to whole chains of software. If package A includes PCRE,
it must acknowledge it, but if package B is software that includes package
A, the condition is not imposed on package B (unless it uses PCRE
independently).
3. Altered versions must be plainly marked as such, and must not be
misrepresented as being the original software.
4. If PCRE is embedded in any software that is released under the GNU
General Purpose Licence (GPL), or Lesser General Purpose Licence (LGPL),
then the terms of that licence shall supersede any condition above with
which it is incompatible.
The documentation for PCRE, supplied in the "doc" directory, is distributed
under the same terms as the software itself.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
POSSIBILITY OF SUCH DAMAGE.
End
+136
View File
@@ -1,6 +1,142 @@
ChangeLog for PCRE
------------------
Version 5.0 13-Sep-04
---------------------
1. Internal change: literal characters are no longer packed up into items
containing multiple characters in a single byte-string. Each character
is now matched using a separate opcode. However, there may be more than one
byte in the character in UTF-8 mode.
2. The pcre_callout_block structure has two new fields: pattern_position and
next_item_length. These contain the offset in the pattern to the next match
item, and its length, respectively.
3. The PCRE_AUTO_CALLOUT option for pcre_compile() requests the automatic
insertion of callouts before each pattern item. Added the /C option to
pcretest to make use of this.
4. On the advice of a Windows user, the lines
#if defined(_WIN32) || defined(WIN32)
_setmode( _fileno( stdout ), 0x8000 );
#endif /* defined(_WIN32) || defined(WIN32) */
have been added to the source of pcretest. This apparently does useful
magic in relation to line terminators.
5. Changed "r" and "w" in the calls to fopen() in pcretest to "rb" and "wb"
for the benefit of those environments where the "b" makes a difference.
6. The icc compiler has the same options as gcc, but "configure" doesn't seem
to know about it. I have put a hack into configure.in that adds in code
to set GCC=yes if CC=icc. This seems to end up at a point in the
generated configure script that is early enough to affect the setting of
compiler options, which is what is needed, but I have no means of testing
whether it really works. (The user who reported this had patched the
generated configure script, which of course I cannot do.)
LATER: After change 22 below (new libtool files), the configure script
seems to know about icc (and also ecc). Therefore, I have commented out
this hack in configure.in.
7. Added support for pkg-config (2 patches were sent in).
8. Negated POSIX character classes that used a combination of internal tables
were completely broken. These were [[:^alpha:]], [[:^alnum:]], and
[[:^ascii:]]. Typically, they would match almost any characters. The other
POSIX classes were not broken in this way.
9. Matching the pattern "\b.*?" against "ab cd", starting at offset 1, failed
to find the match, as PCRE was deluded into thinking that the match had to
start at the start point or following a newline. The same bug applied to
patterns with negative forward assertions or any backward assertions
preceding ".*" at the start, unless the pattern required a fixed first
character. This was a failing pattern: "(?!.bcd).*". The bug is now fixed.
10. In UTF-8 mode, when moving forwards in the subject after a failed match
starting at the last subject character, bytes beyond the end of the subject
string were read.
11. Renamed the variable "class" as "classbits" to make life easier for C++
users. (Previously there was a macro definition, but it apparently wasn't
enough.)
12. Added the new field "tables" to the extra data so that tables can be passed
in at exec time, or the internal tables can be re-selected. This allows
a compiled regex to be saved and re-used at a later time by a different
program that might have everything at different addresses.
13. Modified the pcre-config script so that, when run on Solaris, it shows a
-R library as well as a -L library.
14. The debugging options of pcretest (-d on the command line or D on a
pattern) showed incorrect output for anything following an extended class
that contained multibyte characters and which was followed by a quantifier.
15. Added optional support for general category Unicode character properties
via the \p, \P, and \X escapes. Unicode property support implies UTF-8
support. It adds about 90K to the size of the library. The meanings of the
inbuilt class escapes such as \d and \s have NOT been changed.
16. Updated pcredemo.c to include calls to free() to release the memory for the
compiled pattern.
17. The generated file chartables.c was being created in the source directory
instead of in the building directory. This caused the build to fail if the
source directory was different from the building directory, and was
read-only.
18. Added some sample Win commands from Mark Tetrode into the NON-UNIX-USE
file. No doubt somebody will tell me if they don't make sense... Also added
Dan Mooney's comments about building on OpenVMS.
19. Added support for partial matching via the PCRE_PARTIAL option for
pcre_exec() and the \P data escape in pcretest.
20. Extended pcretest with 3 new pattern features:
(i) A pattern option of the form ">rest-of-line" causes pcretest to
write the compiled pattern to the file whose name is "rest-of-line".
This is a straight binary dump of the data, with the saved pointer to
the character tables forced to be NULL. The study data, if any, is
written too. After writing, pcretest reads a new pattern.
(ii) If, instead of a pattern, "<rest-of-line" is given, pcretest reads a
compiled pattern from the given file. There must not be any
occurrences of "<" in the file name (pretty unlikely); if there are,
pcretest will instead treat the initial "<" as a pattern delimiter.
After reading in the pattern, pcretest goes on to read data lines as
usual.
(iii) The F pattern option causes pcretest to flip the bytes in the 32-bit
and 16-bit fields in a compiled pattern, to simulate a pattern that
was compiled on a host of opposite endianness.
21. The pcre_exec() function can now cope with patterns that were compiled on
hosts of opposite endianness, with this restriction:
As for any compiled expression that is saved and used later, the tables
pointer field cannot be preserved; the extra_data field in the arguments
to pcre_exec() should be used to pass in a tables address if a value
other than the default internal tables was used at compile time.
22. Calling pcre_exec() with a negative value of the "ovecsize" parameter is
now diagnosed as an error. Previously, most of the time, a negative number
would have been treated as zero, but if in addition "ovector" was passed as
NULL, a crash could occur.
23. Updated the files ltmain.sh, config.sub, config.guess, and aclocal.m4 with
new versions from the libtool 1.5 distribution (the last one is a copy of
a file called libtool.m4). This seems to have fixed the need to patch
"configure" to support Darwin 1.3 (which I used to do). However, I still
had to patch ltmain.sh to ensure that ${SED} is set (it isn't on my
workstation).
24. Changed the PCRE licence to be the more standard "BSD" licence.
Version 4.5 01-Dec-03
---------------------
+27 -36
View File
@@ -4,51 +4,42 @@ PCRE LICENCE
PCRE is a library of functions to support regular expressions whose syntax
and semantics are as close as possible to those of the Perl 5 language.
Release 5 of PCRE is distributed under the terms of the "BSD" licence, as
specified below. The documentation for PCRE, supplied in the "doc"
directory, is distributed under the same terms as the software itself.
Written by: Philip Hazel <ph10@cam.ac.uk>
University of Cambridge Computing Service,
Cambridge, England. Phone: +44 1223 334714.
Copyright (c) 1997-2003 University of Cambridge
Copyright (c) 1997-2004 University of Cambridge
All rights reserved.
Permission is granted to anyone to use this software for any purpose on any
computer system, and to redistribute it freely, subject to the following
restrictions:
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are met:
1. This software is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
* Redistributions of source code must retain the above copyright notice,
this list of conditions and the following disclaimer.
2. The origin of this software must not be misrepresented, either by
explicit claim or by omission. In practice, this means that if you use
PCRE in software that you distribute to others, commercially or
otherwise, you must put a sentence like this
* Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in the
documentation and/or other materials provided with the distribution.
Regular expression support is provided by the PCRE library package,
which is open source software, written by Philip Hazel, and copyright
by the University of Cambridge, England.
* Neither the name of the University of Cambridge nor the names of its
contributors may be used to endorse or promote products derived from
this software without specific prior written permission.
somewhere reasonably visible in your documentation and in any relevant
files or online help data or similar. A reference to the ftp site for
the source, that is, to
ftp://ftp.csx.cam.ac.uk/pub/software/programming/pcre/
should also be given in the documentation. However, this condition is not
intended to apply to whole chains of software. If package A includes PCRE,
it must acknowledge it, but if package B is software that includes package
A, the condition is not imposed on package B (unless it uses PCRE
independently).
3. Altered versions must be plainly marked as such, and must not be
misrepresented as being the original software.
4. If PCRE is embedded in any software that is released under the GNU
General Purpose Licence (GPL), or Lesser General Purpose Licence (LGPL),
then the terms of that licence shall supersede any condition above with
which it is incompatible.
The documentation for PCRE, supplied in the "doc" directory, is distributed
under the same terms as the software itself.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
POSSIBILITY OF SUCH DAMAGE.
End
+47
View File
@@ -1,6 +1,53 @@
News about PCRE releases
------------------------
Release 5.0 13-Sep-04
---------------------
The licence under which PCRE is released has been changed to the more
conventional "BSD" licence.
In the code, some bugs have been fixed, and there are also some major changes
in this release (which is why I've increased the number to 5.0). Some changes
are internal rearrangements, and some provide a number of new facilities. The
new features are:
1. There's an "automatic callout" feature that inserts callouts before every
item in the regex, and there's a new callout field that gives the position
in the pattern - useful for debugging and tracing.
2. The extra_data structure can now be used to pass in a set of character
tables at exec time. This is useful if compiled regex are saved and re-used
at a later time when the tables may not be at the same address. If the
default internal tables are used, the pointer saved with the compiled
pattern is now set to NULL, which means that you don't need to do anything
special unless you are using custom tables.
3. It is possible, with some restrictions on the content of the regex, to
request "partial" matching. A special return code is given if all of the
subject string matched part of the regex. This could be useful for testing
an input field as it is being typed.
4. There is now some optional support for Unicode character properties, which
means that pattern items such as \p{Lu} and \X can now be used. Only
the general category properties are supported. If PCRE is compiled with this
support, an additional 90K data structure is included, which increases the
size of the library dramatically.
5. There is support for saving compiled patterns and re-using them later.
6. There is support for running regular expressions that were compiled on a
different host with the opposite endianness.
7. The pcretest program has been extended to accommodate the new features.
The main internal rearrangement is that sequences of literal characters are no
longer handled as strings. Instead, each character is handled on its own. This
makes some UTF-8 handling easier, and makes the support of partial matching
possible. Compiled patterns containing long literal strings will be larger as a
result of this change; I hope that performance will not be much affected.
Release 4.5 01-Dec-03
---------------------
+132 -10
View File
@@ -1,19 +1,25 @@
Compiling PCRE on non-Unix systems
----------------------------------
See below for comments on Cygwin or MinGW usage. I (Philip Hazel) have no
knowledge of Windows systems and how their libraries work. The items in the
PCRE Makefile that relate to anything other than Unix-like systems have been
contributed by PCRE users. There are some other comments and files in the
Contrib directory on the ftp site that you may find useful.
See below for comments on Cygwin or MinGW and OpenVMS usage. I (Philip Hazel)
have no knowledge of Windows or VMS systems and how their libraries work. The
items in the PCRE Makefile that relate to anything other than Unix-like systems
have been contributed by PCRE users. There are some other comments and files in
the Contrib directory on the ftp site that you may find useful. See
The following are generic comments about building PCRE:
ftp://ftp.csx.cam.ac.uk/pub/software/programming/pcre/Contrib
If you want to compile PCRE for a non-Unix system (or perhaps, more strictly,
for a system that does not support "configure" and make files), note that PCRE
consists entirely of code written in Standard C, and so should compile
successfully on any machine with a Standard C compiler and library, using
normal compiling commands to do the following:
for a system that does not support "configure" and "make" files), note that
PCRE consists entirely of code written in Standard C, and so should compile
successfully on any system that has a Standard C compiler and library.
GENERIC INSTRUCTIONS
The following are generic comments about building PCRE. The interspersed
indented commands are suggestions from Mark Tetrode as to which commands you
might use on a Windows system to build a static library.
(1) Copy or rename the file config.in as config.h, and change the macros that
define HAVE_STRERROR and HAVE_MEMMOVE to define them as 1 rather than 0.
@@ -23,32 +29,85 @@ particular, if you want to force a specific value for newline, you can define
the NEWLINE macro. The default is to use '\n', thereby using whatever value
your compiler gives to '\n'.
rem Mark Tetrode's commands
copy config.in config.h
rem Use write, because notepad cannot handle UNIX files. Change values.
write config.h
(2) Copy or rename the file pcre.in as pcre.h, and change the macro definitions
for PCRE_MAJOR, PCRE_MINOR, and PCRE_DATE near its start to the values set in
configure.in.
rem Mark Tetrode's commands
copy pcre.in pcre.h
rem Read values from configure.in
write configure.in
rem Change values
write pcre.h
(3) Compile dftables.c as a stand-alone program, and then run it with
the single argument "chartables.c". This generates a set of standard
character tables and writes them to that file.
rem Mark Tetrode's commands
rem Compile & run
cl -DSUPPORT_UTF8 dftables.c
dftables.exe > chartables.c
(4) Compile maketables.c, get.c, study.c and pcre.c and link them all
together into an object library in whichever form your system keeps such
libraries. This is the pcre library (chartables.c is included by means of an
#include directive). If your system has static and shared libraries, you may
have to do this once for each type.
rem Mark Tetrode's commands, for a static library
rem Compile & lib
cl -DSUPPORT_UTF8 -DPOSIX_MALLOC_THRESHOLD=10 /c maketables.c get.c study.c pcre.c
lib /OUT:pcre.lib maketables.obj get.obj study.obj pcre.obj
(5) Similarly, compile pcreposix.c and link it (on its own) as the pcreposix
library.
rem Mark Tetrode's commands, for a static library
rem Compile & lib
cl -DSUPPORT_UTF8 -DPOSIX_MALLOC_THRESHOLD=10 /c pcreposix.c
lib /OUT:pcreposix.lib pcreposix.obj
(6) Compile the test program pcretest.c. This needs the functions in the
pcre and pcreposix libraries when linking.
rem Mark Tetrode's commands
rem compile & link
cl pcretest.c pcre.lib pcreposix.lib
(7) Run pcretest on the testinput files in the testdata directory, and check
that the output matches the corresponding testoutput files. You must use the
-i option when checking testinput2. Note that the supplied files are in Unix
format, with just LF characters as line terminators. You may need to edit them
to change this if your system uses a different convention.
rem Mark Tetrode's commands
rem Make a change, i.e. space, backspace, and save again - do this for all
rem to change UNIX to Win, \n to \r\n
write testoutput1
write testoutput2
write testoutput3
write testoutput4
write testoutput5
pcretest testdata\testinput1 testdata\myoutput1
windiff testdata\testoutput1 testdata\myoutput1
pcretest -i testdata\testinput2 testdata\myoutput2
windiff testdata\testoutput2 testdata\myoutput2
pcretest testdata\testinput3 testdata\myoutput3
windiff testdata\testoutput3 testdata\myoutput3
pcretest testdata\testinput4 testdata\myoutput4
windiff testdata\testoutput4 testdata\myoutput4
pcretest testdata\testinput5 testdata\myoutput5
windiff testdata\testoutput5 testdata\myoutput5
FURTHER REMARKS
If you have a system without "configure" but where you can use a Makefile, edit
Makefile.in to create Makefile, substituting suitable values for the variables
at the head of the file.
@@ -119,4 +178,67 @@ void (*pcre_free)(void *) = free;
#endif
=========================
BUILDING PCRE ON OPENVMS
Dan Mooney sent the following comments about building PCRE on OpenVMS:
"It was quite easy to compile and link the library. I don't have a formal
make file but the attached file [reproduced below] contains the OpenVMS DCL
commands I used to build the library. I had to add #define
POSIX_MALLOC_THRESHOLD 10 to pcre.h since it was not defined anywhere.
The library was built on:
O/S: HP OpenVMS v7.3-1
Compiler: Compaq C v6.5-001-48BCD
Linker: vA13-01
The test results did not match 100% due to the issues you mention in your
documentation regarding isprint(), iscntrl(), isgraph() and ispunct(). I
modified some of the character tables temporarily and was able to get the
results to match. Tests using the fr locale did not match since I don't have
that locale loaded. The study size was always reported to be 3 less than the
value in the standard test output files."
=========================
$! This DCL procedure builds PCRE on OpenVMS
$!
$! I followed the instructions in the non-unix-use file in the distribution.
$!
$ COMPILE == "CC/LIST/NOMEMBER_ALIGNMENT/PREFIX_LIBRARY_ENTRIES=ALL_ENTRIES
$ COMPILE DFTABLES.C
$ LINK/EXE=DFTABLES.EXE DFTABLES.OBJ
$ RUN DFTABLES.EXE/OUTPUT=CHARTABLES.C
$ COMPILE MAKETABLES.C
$ COMPILE GET.C
$ COMPILE STUDY.C
$! I had to set POSIX_MALLOC_THRESHOLD to 10 in PCRE.H since the symbol
$! did not seem to be defined anywhere.
$! I edited pcre.h and added #DEFINE SUPPORT_UTF8 to enable UTF8 support.
$ COMPILE PCRE.C
$ LIB/CREATE PCRE MAKETABLES.OBJ, GET.OBJ, STUDY.OBJ, PCRE.OBJ
$! I had to set POSIX_MALLOC_THRESHOLD to 10 in PCRE.H since the symbol
$! did not seem to be defined anywhere.
$ COMPILE PCREPOSIX.C
$ LIB/CREATE PCREPOSIX PCREPOSIX.OBJ
$ COMPILE PCRETEST.C
$ LINK/EXE=PCRETEST.EXE PCRETEST.OBJ, PCRE/LIB, PCREPOSIX/LIB
$! C programs that want access to command line arguments must be
$! defined as a symbol
$ PCRETEST :== "$ SYS$ROADSUSERS:[DMOONEY.REGEXP]PCRETEST.EXE"
$! Arguments must be enclosed in quotes.
$ PCRETEST "-C"
$! Test results:
$!
$! The test results did not match 100%. The functions isprint(), iscntrl(),
$! isgraph() and ispunct() on OpenVMS must not produce the same results
$! as the system that built the test output files provided with the
$! distribution.
$!
$! The study size did not match and was always 3 less on OpenVMS.
$!
$! Locale could not be set to fr
$!
=========================
****
+82 -20
View File
@@ -22,6 +22,28 @@ ensure that they link with PCRE's libpcreposix library. Otherwise they may pick
up the "real" POSIX functions of the same name.
Documentation for PCRE
----------------------
If you install PCRE in the normal way, you will end up with an installed set of
man pages whose names all start with "pcre". The one that is called "pcre"
lists all the others. In addition to these man pages, the PCRE documentation is
supplied in two other forms; however, as there is no standard place to install
them, they are left in the doc directory of the unpacked source distribution.
These forms are:
1. Files called doc/pcre.txt, doc/pcregrep.txt, and doc/pcretest.txt. The
first of these is a concatenation of the text forms of all the section 3
man pages except those that summarize individual functions. The other two
are the text forms of the section 1 man pages for the pcregrep and
pcretest commands. Text forms are provided for ease of scanning with text
editors or similar tools.
2. A subdirectory called doc/html contains all the documentation in HTML
form, hyperlinked in various ways, and rooted in a file called
doc/index.html.
Contributions by users of PCRE
------------------------------
@@ -46,7 +68,7 @@ INSTALL.
Most commonly, people build PCRE within its own distribution directory, and in
this case, on many systems, just running "./configure" is sufficient, but the
usual methods of changing standard defaults are available. For example,
usual methods of changing standard defaults are available. For example:
CFLAGS='-O2 -Wall' ./configure --prefix=/opt/local
@@ -69,6 +91,13 @@ library. You can read more about them in the pcrebuild man page.
for handling UTF-8 is not included in the library. (Even when included, it
still has to be enabled by an option at run time.)
. If, in addition to support for UTF-8 character strings, you want to include
support for the \P, \p, and \X sequences that recognize Unicode character
properties, you must add --enable-unicode-properties to the "configure"
command. This adds about 90K to the size of the library (in the form of a
property table); only the basic two-letter properties such as Lu are
supported.
. You can build PCRE to recognize CR or NL as the newline character, instead
of whatever your compiler uses for "\n", by adding --newline-is-cr or
--newline-is-nl to the "configure" command, respectively. Only do this if you
@@ -111,12 +140,14 @@ library. You can read more about them in the pcrebuild man page.
on the "configure" command. PCRE runs more slowly in this mode, but it may be
necessary in environments with limited stack sizes.
The "configure" script builds five files:
The "configure" script builds seven files:
. libtool is a script that builds shared and/or static libraries
. pcre.h is built by copying pcre.in and making substitutions
. Makefile is built by copying Makefile.in and making substitutions.
. config.h is built by copying config.in and making substitutions.
. pcre-config is built by copying pcre-config.in and making substitutions.
. libpcre.pc is data for the pkg-config command, built from libpcre.pc.in
. libtool is a script that builds shared and/or static libraries
. RunTest is a script for running tests
Once "configure" has run, you can run "make". It builds two libraries called
@@ -125,20 +156,33 @@ command. You can use "make install" to copy these, the public header files
pcre.h and pcreposix.h, and the man pages to appropriate live directories on
your system, in the normal way.
Retrieving configuration information on Unix-like systems
---------------------------------------------------------
Running "make install" also installs the command pcre-config, which can be used
to recall information about the PCRE configuration and installation. For
example,
example:
pcre-config --version
prints the version number, and
pcre-config --libs
pcre-config --libs
outputs information about where the library is installed. This command can be
included in makefiles for programs that use PCRE, saving the programmer from
having to remember too many details.
The pkg-config command is another system for saving and retrieving information
about installed libraries. Instead of separate commands for each library, a
single command is used. For example:
pkg-config --cflags pcre
The data is held in *.pc files that are installed in a directory called
pkgconfig.
Shared libraries on Unix-like systems
-------------------------------------
@@ -158,7 +202,7 @@ installed themselves. However, the versions left in the source directory still
use the uninstalled libraries.
To build PCRE using static libraries only you must use --disable-shared when
configuring it. For example
configuring it. For example:
./configure --prefix=/usr/gnu --disable-shared
@@ -202,9 +246,9 @@ configuring process. (This can also be run by "make runtest", "make check", or
The script runs the pcretest test program (which is documented in its own man
page) on each of the testinput files (in the testdata directory) in turn,
and compares the output with the contents of the corresponding testoutput file.
A file called testtry is used to hold the output from pcretest. To run pcretest
on just one of the test files, give its number as an argument to RunTest, for
example:
A file called testtry is used to hold the main output from pcretest
(testsavedregex is also used as a working file). To run pcretest on just one of
the test files, give its number as an argument to RunTest, for example:
RunTest 2
@@ -247,19 +291,23 @@ running "configure". This file can be also fed directly to the perltest script,
provided you are running Perl 5.8 or higher. (For Perl 5.6, a small patch,
commented in the script, can be used.)
The fifth and final file tests error handling with UTF-8 encoding, and internal
UTF-8 features of PCRE that are not relevant to Perl.
The fifth test checks error handling with UTF-8 encoding, and internal UTF-8
features of PCRE that are not relevant to Perl.
The sixth and final test checks the support for Unicode character properties.
It is not run automatically unless PCRE is built with Unicode property support.
To do this you must set --enable-unicode-properties when running "configure".
Character tables
----------------
PCRE uses four tables for manipulating and identifying characters. The final
argument of the pcre_compile() function is a pointer to a block of memory
containing the concatenated tables. A call to pcre_maketables() can be used to
generate a set of tables in the current locale. If the final argument for
pcre_compile() is passed as NULL, a set of default tables that is built into
the binary is used.
PCRE uses four tables for manipulating and identifying characters whose values
are less than 256. The final argument of the pcre_compile() function is a
pointer to a block of memory containing the concatenated tables. A call to
pcre_maketables() can be used to generate a set of tables in the current
locale. If the final argument for pcre_compile() is passed as NULL, a set of
default tables that is built into the binary is used.
The source file called chartables.c contains the default set of tables. This is
not supplied in the distribution, but is built by the program dftables
@@ -299,12 +347,20 @@ The distribution should contain the following files:
headers:
dftables.c auxiliary program for building chartables.c
get.c )
maketables.c )
study.c ) source of
pcre.c ) the functions
study.c ) source of the functions
pcre.c ) in the library
pcreposix.c )
printint.c )
ucp.c )
ucp.h ) source for the code that is used for
ucpinternal.h ) Unicode property handling
ucptable.c )
ucptypetable.c )
pcre.in "source" for the header for the external API; pcre.h
is built from this by "configure"
pcreposix.h header for the external POSIX wrapper API
@@ -335,7 +391,9 @@ The distribution should contain the following files:
doc/pcretest.txt plain text documentation of test program
doc/perltest.txt plain text documentation of Perl test program
install-sh a shell script for installing files
libpcre.pc.in "source" for libpcre.pc for pkg-config
ltmain.sh file used to build a libtool script
mkinstalldirs script for making install directories
pcretest.c comprehensive test program
pcredemo.c simple demonstration of coding calls to PCRE
perltest Perl test program
@@ -346,15 +404,19 @@ The distribution should contain the following files:
testdata/testinput3 test data for locale-specific tests
testdata/testinput4 test data for UTF-8 tests compatible with Perl
testdata/testinput5 test data for other UTF-8 tests
testdata/testinput6 test data for Unicode property support tests
testdata/testoutput1 test results corresponding to testinput1
testdata/testoutput2 test results corresponding to testinput2
testdata/testoutput3 test results corresponding to testinput3
testdata/testoutput4 test results corresponding to testinput4
testdata/testoutput5 test results corresponding to testinput5
testdata/testoutput6 test results corresponding to testinput6
(C) Auxiliary files for Win32 DLL
dll.mk
libpcre.def
libpcreposix.def
pcre.def
(D) Auxiliary file for VPASCAL
@@ -362,4 +424,4 @@ The distribution should contain the following files:
makevp.bat
Philip Hazel <ph10@cam.ac.uk>
December 2003
September 2004
+21 -15
View File
@@ -11,26 +11,32 @@ Written by: Philip Hazel <ph10@cam.ac.uk>
Copyright (c) 1997-2004 University of Cambridge
-----------------------------------------------------------------------------
Permission is granted to anyone to use this software for any purpose on any
computer system, and to redistribute it freely, subject to the following
restrictions:
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are met:
1. This software is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
* Redistributions of source code must retain the above copyright notice,
this list of conditions and the following disclaimer.
2. The origin of this software must not be misrepresented, either by
explicit claim or by omission.
* Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in the
documentation and/or other materials provided with the distribution.
3. Altered versions must be plainly marked as such, and must not be
misrepresented as being the original software.
* Neither the name of the University of Cambridge nor the names of its
contributors may be used to endorse or promote products derived from
this software without specific prior written permission.
4. If PCRE is embedded in any software that is released under the GNU
General Purpose Licence (GPL), then the terms of that licence shall
supersede any condition above with which it is incompatible.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
POSSIBILITY OF SUCH DAMAGE.
-----------------------------------------------------------------------------
See the file Tech.Notes for some information on the internals.
*/
+67 -33
View File
@@ -1,6 +1,9 @@
Technical Notes about PCRE
--------------------------
Historical note 1
-----------------
Many years ago I implemented some regular expression functions to an algorithm
suggested by Martin Richards. These were not Unix-like in form, and were quite
restricted in what they could do by comparison with Perl. The interesting part
@@ -9,12 +12,15 @@ form of an expression was known in advance. The code to apply an expression did
not operate by backtracking, as the original Henry Spencer code and current
Perl code do, but instead checked all possibilities simultaneously by keeping
a list of current states and checking all of them as it advanced through the
subject string. (In the terminology of Jeffrey Friedl's book, it was a "DFA
algorithm".) When the pattern was all used up, all remaining states were
subject string. In the terminology of Jeffrey Friedl's book, it was a "DFA
algorithm". When the pattern was all used up, all remaining states were
possible matches, and the one matching the longest subset of the subject string
was chosen. This did not necessarily maximize the individual wild portions of
the pattern, as is expected in Unix and Perl-style regular expressions.
Historical note 2
-----------------
By contrast, the code originally written by Henry Spencer and subsequently
heavily modified for Perl actually compiles the expression twice: once in a
dummy mode in order to find out how much store will be needed, and then for
@@ -23,6 +29,9 @@ optionally, minimizing in Perl) the amount of the subject that matches
individual wild portions of the pattern. This is an "NFA algorithm" in Friedl's
terminology.
OK, here's the real stuff
-------------------------
For the set of functions that forms PCRE (which are unrelated to those
mentioned above), I tried at first to invent an algorithm that used an amount
of store bounded by a multiple of the number of characters in the pattern, to
@@ -38,8 +47,16 @@ got quite big anyway to handle all the Perl stuff.
The compiled form of a pattern is a vector of bytes, containing items of
variable length. The first byte in an item is an opcode, and the length of the
item is either implicit in the opcode or contained in the data bytes which
follow it. A list of all the opcodes follows:
item is either implicit in the opcode or contained in the data bytes that
follow it.
In many cases below "two-byte" data values are specified. This is in fact just
a default. PCRE can be compiled to use 3-byte or 4-byte values (impairing the
performance). This is necessary only when patterns whose compiled length is
greater than 64K are going to be processed. In this description, we assume the
"normal" compilation options.
A list of all the opcodes follows:
Opcodes with no following data
------------------------------
@@ -48,7 +65,7 @@ These items are all just one byte long
OP_END end of pattern
OP_ANY match any character
OP_ANYBYTE match any single byte, even in UTF-8 mode
OP_ANYBYTE match any single byte, even in UTF-8 mode
OP_SOD match start of data: \A
OP_SOM start of match (subject + offset): \G
OP_CIRC ^ (start of data, or after \n in multiline)
@@ -63,13 +80,14 @@ These items are all just one byte long
OP_EODN match end of data or \n at end: \Z
OP_EOD match end of data: \z
OP_DOLL $ (end of data, or before \n in multiline)
OP_EXTUNI match an extended Unicode character
Repeating single characters
---------------------------
The common repeats (*, +, ?) when applied to a single character appear as
two-byte items using the following opcodes:
The common repeats (*, +, ?) when applied to a single character use the
following opcodes:
OP_STAR
OP_MINSTAR
@@ -78,6 +96,7 @@ two-byte items using the following opcodes:
OP_QUERY
OP_MINQUERY
In ASCII mode, these are two-byte items; in UTF-8 mode, the length is variable.
Those with "MIN" in their name are the minimizing versions. Each is followed by
the character that is to be repeated. Other repeats make use of
@@ -109,39 +128,52 @@ byte. The opcodes are:
OP_TYPEEXACT
Matching a character string
Match by Unicode property
-------------------------
OP_PROP and OP_NOTPROP are used for positive and negative matches of a
character by testing its Unicode property (the \p and \P escape sequences).
Each is followed by a single byte that encodes the desired property value.
Repeats of these items use the OP_TYPESTAR etc. set of opcodes, followed by two
bytes: OP_PROP or OP_NOTPROP and then the desired property value.
Matching literal characters
---------------------------
The OP_CHARS opcode is followed by a one-byte count and then that number of
characters. If there are more than 255 characters in sequence, successive
instances of OP_CHARS are used.
The OP_CHAR opcode is followed by a single character that is to be matched
casefully. For caseless matching, OP_CHARNC is used. In UTF-8 mode, the
character may be more than one byte long. (Earlier versions of PCRE used
multi-character strings, but this was changed to allow some new features to be
added.)
Character classes
-----------------
If there is only one character, OP_CHARS is used for a positive class,
and OP_NOT for a negative one (that is, for something like [^a]). However, in
UTF-8 mode, this applies only to characters with values < 128, because OP_NOT
is confined to single bytes.
If there is only one character, OP_CHAR or OP_CHARNC is used for a positive
class, and OP_NOT for a negative one (that is, for something like [^a]).
However, in UTF-8 mode, the use of OP_NOT applies only to characters with
values < 128, because OP_NOT is confined to single bytes.
Another set of repeating opcodes (OP_NOTSTAR etc.) are used for a repeated,
negated, single-character class. The normal ones (OP_STAR etc.) are used for a
repeated positive single-character class.
When there's more than one character in a class and all the characters are less
than 256, OP_CLASS is used for a positive class, and OP_NCLASS for a negative
than 256, OP_CLASS is used for a positive class, and OP_NCLASS for a negative
one. In either case, the opcode is followed by a 32-byte bit map containing a 1
bit for every character that is acceptable. The bits are counted from the least
significant end of each byte.
The reason for having both OP_CLASS and OP_NCLASS is so that, in UTF-8 mode,
subject characters with values greater than 256 can be handled correctly. For
The reason for having both OP_CLASS and OP_NCLASS is so that, in UTF-8 mode,
subject characters with values greater than 255 can be handled correctly. For
OP_CLASS they don't match, whereas for OP_NCLASS they do.
For classes containing characters with values > 255, OP_XCLASS is used. It
optionally uses a bit map (if any characters lie within it), followed by a list
of pairs and single characters. There is a flag character than indicates
of pairs and single characters. There is a flag character that indicates
whether it's a positive or a negative class.
@@ -192,14 +224,14 @@ the bracket itself. (They could have all been done like this, but I was making
minimal changes.)
A bracket opcode is followed by two bytes which give the offset to the next
alternative OP_ALT or, if there aren't any branches, to the matching KET
alternative OP_ALT or, if there aren't any branches, to the matching OP_KET
opcode. Each OP_ALT is followed by two bytes giving the offset to the next one,
or to the KET opcode.
or to the OP_KET opcode.
OP_KET is used for subpatterns that do not repeat indefinitely, while
OP_KETRMIN and OP_KETRMAX are used for indefinite repetitions, minimally or
maximally respectively. All three are followed by two bytes giving (as a
positive number) the offset back to the matching BRA opcode.
positive number) the offset back to the matching OP_BRA opcode.
If a subpattern is quantified such that it is permitted to match zero times, it
is preceded by one of OP_BRAZERO or OP_BRAMINZERO. These are single-byte
@@ -207,15 +239,14 @@ opcodes which tell the matcher that skipping this subpattern entirely is a
valid branch.
A subpattern with an indefinite maximum repetition is replicated in the
compiled data its minimum number of times (or once with a BRAZERO if the
minimum is zero), with the final copy terminating with a KETRMIN or KETRMAX as
appropriate.
compiled data its minimum number of times (or once with OP_BRAZERO if the
minimum is zero), with the final copy terminating with OP_KETRMIN or OP_KETRMAX
as appropriate.
A subpattern with a bounded maximum repetition is replicated in a nested
fashion up to the maximum number of times, with BRAZERO or BRAMINZERO before
each replication after the minimum, so that, for example, (abc){2,5} is
compiled as (abc)(abc)((abc)((abc)(abc)?)?)?. The 99 and 200 bracket limits do
not apply to these internally generated brackets.
fashion up to the maximum number of times, with OP_BRAZERO or OP_BRAMINZERO
before each replication after the minimum, so that, for example, (abc){2,5} is
compiled as (abc)(abc)((abc)((abc)(abc)?)?)?.
Assertions
@@ -260,8 +291,11 @@ from the start of the whole pattern.
Callout
-------
OP_CALLOUT is followed by one byte of data that holds a callout number in the
range 0 to 255.
OP_CALLOUT is followed by one byte of data that holds a callout number in the
range 0 to 254 for manual callouts, or 255 for an automatic callout. In both
cases there follows a two-byte value giving the offset in the pattern to the
start of the following item, and another two-byte item giving the length of the
next item.
Changing options
@@ -278,4 +312,4 @@ at compile time, and so does not cause anything to be put into the compiled
data.
Philip Hazel
August 2003
September 2004
File diff suppressed because it is too large Load Diff
+23 -15
View File
@@ -9,32 +9,40 @@ the file Tech.Notes for some information on the internals.
Written by: Philip Hazel <ph10@cam.ac.uk>
Copyright (c) 1997-2004 University of Cambridge
Copyright (c) 1997-2003 University of Cambridge
-----------------------------------------------------------------------------
Permission is granted to anyone to use this software for any purpose on any
computer system, and to redistribute it freely, subject to the following
restrictions:
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are met:
1. This software is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
* Redistributions of source code must retain the above copyright notice,
this list of conditions and the following disclaimer.
2. The origin of this software must not be misrepresented, either by
explicit claim or by omission.
* Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in the
documentation and/or other materials provided with the distribution.
3. Altered versions must be plainly marked as such, and must not be
misrepresented as being the original software.
* Neither the name of the University of Cambridge nor the names of its
contributors may be used to endorse or promote products derived from
this software without specific prior written permission.
4. If PCRE is embedded in any software that is released under the GNU
General Purpose Licence (GPL), then the terms of that licence shall
supersede any condition above with which it is incompatible.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
POSSIBILITY OF SUCH DAMAGE.
-----------------------------------------------------------------------------
*/
/* This module contains some convenience functions for extracting substrings
from the subject string after a regex match has succeeded. The original idea
for these functions came from Scott Wimer <scottw@cgibuilder.com>. */
for these functions came from Scott Wimer. */
/* Include the internals header, which itself includes Standard C headers plus
+221 -158
View File
@@ -5,30 +5,38 @@
/* This is a library of functions to support regular expressions whose syntax
and semantics are as close as possible to those of the Perl 5 language. See
the file Tech.Notes for some information on the internals.
the file doc/Tech.Notes for some information on the internals.
Written by: Philip Hazel <ph10@cam.ac.uk>
Copyright (c) 1997-2003 University of Cambridge
Copyright (c) 1997-2004 University of Cambridge
-----------------------------------------------------------------------------
Permission is granted to anyone to use this software for any purpose on any
computer system, and to redistribute it freely, subject to the following
restrictions:
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are met:
1. This software is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
* Redistributions of source code must retain the above copyright notice,
this list of conditions and the following disclaimer.
2. The origin of this software must not be misrepresented, either by
explicit claim or by omission.
* Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in the
documentation and/or other materials provided with the distribution.
3. Altered versions must be plainly marked as such, and must not be
misrepresented as being the original software.
* Neither the name of the University of Cambridge nor the names of its
contributors may be used to endorse or promote products derived from
this software without specific prior written permission.
4. If PCRE is embedded in any software that is released under the GNU
General Purpose Licence (GPL), then the terms of that licence shall
supersede any condition above with which it is incompatible.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
POSSIBILITY OF SUCH DAMAGE.
-----------------------------------------------------------------------------
*/
@@ -45,6 +53,18 @@ modules, but which are not relevant to the outside. */
# include <php_config.h>
#endif
/* Standard C headers plus the external interface definition. The only time
setjmp and stdarg are used is when NO_RECURSE is set. */
#include <ctype.h>
#include <limits.h>
#include <setjmp.h>
#include <stdarg.h>
#include <stddef.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#ifndef PCRE_SPY
#define PCRE_DEFINITION /* Win32 __declspec(export) trigger for .dll */
#endif
@@ -57,24 +77,45 @@ On Unix systems, "configure" can be used to override this default. */
#define NEWLINE '\n'
#endif
/* MATCH_LIMIT is the default cap on how many times the match() function may
be called during one execution of pcre_exec(); a different limit can also be
set at run time. Its purpose is to stop runaway regular expressions that would
otherwise take for ever to discover that they do not match. The default is
deliberately very large so that legitimate patterns are not caught by
accident. On Unix systems, "configure" can be used to override this default. */
#if !defined(MATCH_LIMIT)
#define MATCH_LIMIT 10000000
#endif
/* If you are compiling for a system that needs some magic to be inserted
* before the definition of an exported function, define this macro to contain
* the relevant magic. It appears at the start of every exported function. */
#define EXPORT
/* PCRE needs unsigned integer types that are exactly 16 and 32 bits wide.
Because the library is often cross-compiled for use on other systems, the
widths cannot be discovered by running a program as part of "configure".
Instead, the types are picked at preprocessor time by comparing the maximum
values that a standard C environment makes available. */

/* 16-bit unsigned type: prefer unsigned short, fall back to unsigned int. */
#if USHRT_MAX == 0xffff
  typedef unsigned short pcre_uint16;
#elif UINT_MAX == 0xffff
  typedef unsigned int pcre_uint16;
#else
  #error Cannot determine a type for 16-bit unsigned integers
#endif

/* 32-bit unsigned type: prefer unsigned int, fall back to unsigned long. */
#if UINT_MAX == 0xffffffff
  typedef unsigned int pcre_uint32;
#elif ULONG_MAX == 0xffffffff
  typedef unsigned long int pcre_uint32;
#else
  #error Cannot determine a type for 32-bit unsigned integers
#endif
/* All character handling must be done as unsigned characters. Otherwise there
are problems with top-bit-set characters and functions such as isspace().
However, we leave the interface to the outside world as char *, because that
should make things easier for callers. We define a short type for unsigned char
to save lots of typing. I tried "uchar", but it causes problems on Digital
Unix, where it is defined in sys/types, so use "uschar" instead. */
typedef unsigned char uschar;
/* Include the public PCRE header */
#include "pcre.h"
/* When compiling for use with the Virtual Pascal compiler, these functions
@@ -95,18 +136,6 @@ neither (there some non-Unix environments where this is the case). This assumes
that all calls to memmove are moving strings upwards in store, which is the
case in PCRE. */
/* Standard C headers plus the external interface definition. The only time
setjmp and stdarg are used is when NO_RECURSE is set. */
#include <ctype.h>
#include <limits.h>
#include <setjmp.h>
#include <stdarg.h>
#include <stddef.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#if ! HAVE_MEMMOVE
#undef memmove /* some systems may have a macro */
#if HAVE_BCOPY
@@ -126,13 +155,14 @@ for (i = 0; i < n; ++i) *(--dest) = *(--src);
#endif /* not VPCOMPAT */
/* PCRE keeps offsets in its compiled code as 2-byte quantities by default.
These are used, for example, to link from the start of a subpattern to its
alternatives and its end. The use of 2 bytes per offset limits the size of the
compiled regex to around 64K, which is big enough for almost everybody.
However, I received a request for an even bigger limit. For this reason, and
also to make the code easier to maintain, the storing and loading of offsets
from the byte string is now handled by the macros that are defined here.
/* PCRE keeps offsets in its compiled code as 2-byte quantities (always stored
in big-endian order) by default. These are used, for example, to link from the
start of a subpattern to its alternatives and its end. The use of 2 bytes per
offset limits the size of the compiled regex to around 64K, which is big enough
for almost everybody. However, I received a request for an even bigger limit.
For this reason, and also to make the code easier to maintain, the storing and
loading of offsets from the byte string is now handled by the macros that are
defined here.
The macros are controlled by the value of LINK_SIZE. This defaults to 2 in
the config.h file, but can be overridden by using -D on the command line. This
@@ -208,6 +238,7 @@ Standard C system should have one. */
#define offsetof(p_type,field) ((size_t)&(((p_type *)0)->field))
#endif
/* These are the public options that can change during matching. */
#define PCRE_IMS (PCRE_CASELESS|PCRE_MULTILINE|PCRE_DOTALL)
@@ -216,12 +247,13 @@ Standard C system should have one. */
but skip the top bit so we can use ints for convenience without getting tangled
with negative values. The public options defined in pcre.h start at the least
significant end. Make sure they don't overlap, though now that we have expanded
to four bytes there is plenty of space. */
to four bytes, there is plenty of space. */
/* Each private flag occupies a single bit, counting down from bit 30; bit 31
(the sign bit of a 32-bit int) is deliberately left unused. */
#define PCRE_FIRSTSET   (1 << 30)  /* first_byte is set */
#define PCRE_REQCHSET   (1 << 29)  /* req_byte is set */
#define PCRE_STARTLINE  (1 << 28)  /* start after \n for multiline */
#define PCRE_ICHANGED   (1 << 27)  /* i option changes within regex */
#define PCRE_NOPARTIAL  (1 << 26)  /* can't use partial with this regex */
/* Options for the "extra" block produced by pcre_study(). */
@@ -233,10 +265,11 @@ time, run time or study time, respectively. */
#define PUBLIC_OPTIONS \
(PCRE_CASELESS|PCRE_EXTENDED|PCRE_ANCHORED|PCRE_MULTILINE| \
PCRE_DOTALL|PCRE_DOLLAR_ENDONLY|PCRE_EXTRA|PCRE_UNGREEDY|PCRE_UTF8| \
PCRE_NO_AUTO_CAPTURE|PCRE_NO_UTF8_CHECK)
PCRE_NO_AUTO_CAPTURE|PCRE_NO_UTF8_CHECK|PCRE_AUTO_CALLOUT)
#define PUBLIC_EXEC_OPTIONS \
(PCRE_ANCHORED|PCRE_NOTBOL|PCRE_NOTEOL|PCRE_NOTEMPTY|PCRE_NO_UTF8_CHECK)
(PCRE_ANCHORED|PCRE_NOTBOL|PCRE_NOTEOL|PCRE_NOTEMPTY|PCRE_NO_UTF8_CHECK| \
PCRE_PARTIAL)
#define PUBLIC_STUDY_OPTIONS 0 /* None defined */
@@ -296,12 +329,13 @@ definitions below, up to ESC_z. There's a dummy for OP_ANY because it
corresponds to "." rather than an escape sequence. The final one must be
ESC_REF as subsequent values are used for \1, \2, \3, etc. There are two
tests in the code for an escape greater than ESC_b and less than ESC_Z to
detect the types that may be repeated. These are the types that consume a
character. If any new escapes are put in between that don't consume a
detect the types that may be repeated. These are the types that consume
characters. If any new escapes are put in between that don't consume a
character, that code will have to change. */
enum { ESC_A = 1, ESC_G, ESC_B, ESC_b, ESC_D, ESC_d, ESC_S, ESC_s, ESC_W,
ESC_w, ESC_dum1, ESC_C, ESC_Z, ESC_z, ESC_E, ESC_Q, ESC_REF };
ESC_w, ESC_dum1, ESC_C, ESC_P, ESC_p, ESC_X, ESC_Z, ESC_z, ESC_E,
ESC_Q, ESC_REF };
/* Flag bits and data types for the extended class (OP_XCLASS) for classes that
contain UTF-8 characters with values greater than 255. */
@@ -312,6 +346,8 @@ contain UTF-8 characters with values greater than 255. */
#define XCL_END 0 /* Marks end of individual items */
#define XCL_SINGLE 1 /* Single item (one multibyte char) follows */
#define XCL_RANGE 2 /* A range (two multibyte chars) follows */
#define XCL_PROP 3 /* Unicode property (one property code) follows */
#define XCL_NOTPROP 4 /* Unicode inverted property (ditto) */
/* Opcode table: OP_BRA must be last, as all values >= it are used for brackets
@@ -337,100 +373,112 @@ enum {
OP_WORDCHAR, /* 10 \w */
OP_ANY, /* 11 Match any character */
OP_ANYBYTE, /* 12 Match any byte (\C); different to OP_ANY for UTF-8 */
OP_EODN, /* 13 End of data or \n at end of data: \Z. */
OP_EOD, /* 14 End of data: \z */
OP_NOTPROP, /* 13 \P (not Unicode property) */
OP_PROP, /* 14 \p (Unicode property) */
OP_EXTUNI, /* 15 \X (extended Unicode sequence) */
OP_EODN, /* 16 End of data or \n at end of data: \Z. */
OP_EOD, /* 17 End of data: \z */
OP_OPT, /* 15 Set runtime options */
OP_CIRC, /* 16 Start of line - varies with multiline switch */
OP_DOLL, /* 17 End of line - varies with multiline switch */
OP_CHARS, /* 18 Match string of characters */
OP_NOT, /* 19 Match anything but the following char */
OP_OPT, /* 18 Set runtime options */
OP_CIRC, /* 19 Start of line - varies with multiline switch */
OP_DOLL, /* 20 End of line - varies with multiline switch */
OP_CHAR, /* 21 Match one character, casefully */
OP_CHARNC, /* 22 Match one character, caselessly */
OP_NOT, /* 23 Match anything but the following char */
OP_STAR, /* 20 The maximizing and minimizing versions of */
OP_MINSTAR, /* 21 all these opcodes must come in pairs, with */
OP_PLUS, /* 22 the minimizing one second. */
OP_MINPLUS, /* 23 This first set applies to single characters */
OP_QUERY, /* 24 */
OP_MINQUERY, /* 25 */
OP_UPTO, /* 26 From 0 to n matches */
OP_MINUPTO, /* 27 */
OP_EXACT, /* 28 Exactly n matches */
OP_STAR, /* 24 The maximizing and minimizing versions of */
OP_MINSTAR, /* 25 all these opcodes must come in pairs, with */
OP_PLUS, /* 26 the minimizing one second. */
OP_MINPLUS, /* 27 This first set applies to single characters */
OP_QUERY, /* 28 */
OP_MINQUERY, /* 29 */
OP_UPTO, /* 30 From 0 to n matches */
OP_MINUPTO, /* 31 */
OP_EXACT, /* 32 Exactly n matches */
OP_NOTSTAR, /* 29 The maximizing and minimizing versions of */
OP_NOTMINSTAR, /* 30 all these opcodes must come in pairs, with */
OP_NOTPLUS, /* 31 the minimizing one second. */
OP_NOTMINPLUS, /* 32 This set applies to "not" single characters */
OP_NOTQUERY, /* 33 */
OP_NOTMINQUERY, /* 34 */
OP_NOTUPTO, /* 35 From 0 to n matches */
OP_NOTMINUPTO, /* 36 */
OP_NOTEXACT, /* 37 Exactly n matches */
OP_NOTSTAR, /* 33 The maximizing and minimizing versions of */
OP_NOTMINSTAR, /* 34 all these opcodes must come in pairs, with */
OP_NOTPLUS, /* 35 the minimizing one second. */
OP_NOTMINPLUS, /* 36 This set applies to "not" single characters */
OP_NOTQUERY, /* 37 */
OP_NOTMINQUERY, /* 38 */
OP_NOTUPTO, /* 39 From 0 to n matches */
OP_NOTMINUPTO, /* 40 */
OP_NOTEXACT, /* 41 Exactly n matches */
OP_TYPESTAR, /* 38 The maximizing and minimizing versions of */
OP_TYPEMINSTAR, /* 39 all these opcodes must come in pairs, with */
OP_TYPEPLUS, /* 40 the minimizing one second. These codes must */
OP_TYPEMINPLUS, /* 41 be in exactly the same order as those above. */
OP_TYPEQUERY, /* 42 This set applies to character types such as \d */
OP_TYPEMINQUERY, /* 43 */
OP_TYPEUPTO, /* 44 From 0 to n matches */
OP_TYPEMINUPTO, /* 45 */
OP_TYPEEXACT, /* 46 Exactly n matches */
OP_TYPESTAR, /* 42 The maximizing and minimizing versions of */
OP_TYPEMINSTAR, /* 43 all these opcodes must come in pairs, with */
OP_TYPEPLUS, /* 44 the minimizing one second. These codes must */
OP_TYPEMINPLUS, /* 45 be in exactly the same order as those above. */
OP_TYPEQUERY, /* 46 This set applies to character types such as \d */
OP_TYPEMINQUERY, /* 47 */
OP_TYPEUPTO, /* 48 From 0 to n matches */
OP_TYPEMINUPTO, /* 49 */
OP_TYPEEXACT, /* 50 Exactly n matches */
OP_CRSTAR, /* 47 The maximizing and minimizing versions of */
OP_CRMINSTAR, /* 48 all these opcodes must come in pairs, with */
OP_CRPLUS, /* 49 the minimizing one second. These codes must */
OP_CRMINPLUS, /* 50 be in exactly the same order as those above. */
OP_CRQUERY, /* 51 These are for character classes and back refs */
OP_CRMINQUERY, /* 52 */
OP_CRRANGE, /* 53 These are different to the three seta above. */
OP_CRMINRANGE, /* 54 */
OP_CRSTAR, /* 51 The maximizing and minimizing versions of */
OP_CRMINSTAR, /* 52 all these opcodes must come in pairs, with */
OP_CRPLUS, /* 53 the minimizing one second. These codes must */
OP_CRMINPLUS, /* 54 be in exactly the same order as those above. */
OP_CRQUERY, /* 55 These are for character classes and back refs */
OP_CRMINQUERY, /* 56 */
OP_CRRANGE, /* 57 These are different to the three sets above. */
OP_CRMINRANGE, /* 58 */
OP_CLASS, /* 55 Match a character class, chars < 256 only */
OP_NCLASS, /* 56 Same, but the bitmap was created from a negative
OP_CLASS, /* 59 Match a character class, chars < 256 only */
OP_NCLASS, /* 60 Same, but the bitmap was created from a negative
class - the difference is relevant only when a UTF-8
character > 255 is encountered. */
OP_XCLASS, /* 57 Extended class for handling UTF-8 chars within the
OP_XCLASS, /* 61 Extended class for handling UTF-8 chars within the
class. This does both positive and negative. */
OP_REF, /* 58 Match a back reference */
OP_RECURSE, /* 59 Match a numbered subpattern (possibly recursive) */
OP_CALLOUT, /* 60 Call out to external function if provided */
OP_REF, /* 62 Match a back reference */
OP_RECURSE, /* 63 Match a numbered subpattern (possibly recursive) */
OP_CALLOUT, /* 64 Call out to external function if provided */
OP_ALT, /* 61 Start of alternation */
OP_KET, /* 62 End of group that doesn't have an unbounded repeat */
OP_KETRMAX, /* 63 These two must remain together and in this */
OP_KETRMIN, /* 64 order. They are for groups the repeat for ever. */
OP_ALT, /* 65 Start of alternation */
OP_KET, /* 66 End of group that doesn't have an unbounded repeat */
OP_KETRMAX, /* 67 These two must remain together and in this */
OP_KETRMIN, /* 68 order. They are for groups the repeat for ever. */
/* The assertions must come before ONCE and COND */
OP_ASSERT, /* 65 Positive lookahead */
OP_ASSERT_NOT, /* 66 Negative lookahead */
OP_ASSERTBACK, /* 67 Positive lookbehind */
OP_ASSERTBACK_NOT, /* 68 Negative lookbehind */
OP_REVERSE, /* 69 Move pointer back - used in lookbehind assertions */
OP_ASSERT, /* 69 Positive lookahead */
OP_ASSERT_NOT, /* 70 Negative lookahead */
OP_ASSERTBACK, /* 71 Positive lookbehind */
OP_ASSERTBACK_NOT, /* 72 Negative lookbehind */
OP_REVERSE, /* 73 Move pointer back - used in lookbehind assertions */
/* ONCE and COND must come after the assertions, with ONCE first, as there's
a test for >= ONCE for a subpattern that isn't an assertion. */
OP_ONCE, /* 70 Once matched, don't back up into the subpattern */
OP_COND, /* 71 Conditional group */
OP_CREF, /* 72 Used to hold an extraction string number (cond ref) */
OP_ONCE, /* 74 Once matched, don't back up into the subpattern */
OP_COND, /* 75 Conditional group */
OP_CREF, /* 76 Used to hold an extraction string number (cond ref) */
OP_BRAZERO, /* 73 These two must remain together and in this */
OP_BRAMINZERO, /* 74 order. */
OP_BRAZERO, /* 77 These two must remain together and in this */
OP_BRAMINZERO, /* 78 order. */
OP_BRANUMBER, /* 75 Used for extracting brackets whose number is greater
OP_BRANUMBER, /* 79 Used for extracting brackets whose number is greater
than can fit into an opcode. */
OP_BRA /* 76 This and greater values are used for brackets that
extract substrings up to a basic limit. After that,
use is made of OP_BRANUMBER. */
OP_BRA /* 80 This and greater values are used for brackets that
extract substrings up to EXTRACT_BASIC_MAX. After
that, use is made of OP_BRANUMBER. */
};
/* WARNING: There is an implicit assumption in study.c that all opcodes are
less than 128 in value. This makes handling UTF-8 character sequences easier.
*/
/* WARNING WARNING WARNING: There is an implicit assumption in pcre.c and
study.c that all opcodes are less than 128 in value. This makes handling UTF-8
character sequences easier. */
/* The highest extraction number before we have to start using additional
bytes. (Originally PCRE didn't have support for extraction counts higher than
this number.) The value is limited by the number of opcodes left after OP_BRA,
i.e. 255 - OP_BRA. We actually set it a bit lower to leave room for additional
opcodes. */
#define EXTRACT_BASIC_MAX 100
/* This macro defines textual names for all the opcodes. These are used only
@@ -439,8 +487,10 @@ macro is referenced only in printint.c. */
#define OP_NAME_LIST \
"End", "\\A", "\\G", "\\B", "\\b", "\\D", "\\d", \
"\\S", "\\s", "\\W", "\\w", "Any", "Anybyte", "\\Z", "\\z", \
"Opt", "^", "$", "chars", "not", \
"\\S", "\\s", "\\W", "\\w", "Any", "Anybyte", \
"notprop", "prop", "extuni", \
"\\Z", "\\z", \
"Opt", "^", "$", "char", "charnc", "not", \
"*", "*?", "+", "+?", "?", "??", "{", "{", "{", \
"*", "*?", "+", "+?", "?", "??", "{", "{", "{", \
"*", "*?", "+", "+?", "?", "??", "{", "{", "{", \
@@ -463,8 +513,11 @@ in UTF-8 mode. The code that uses this table must know about such things. */
#define OP_LENGTHS \
1, /* End */ \
1, 1, 1, 1, 1, 1, 1, 1, 1, 1, /* \A, \G, \B, \B, \D, \d, \S, \s, \W, \w */ \
1, 1, 1, 1, 2, 1, 1, /* Any, Anybyte, \Z, \z, Opt, ^, $ */ \
2, /* Chars - the minimum length */ \
1, 1, /* Any, Anybyte */ \
2, 2, 1, /* NOTPROP, PROP, EXTUNI */ \
1, 1, 2, 1, 1, /* \Z, \z, Opt, ^, $ */ \
2, /* Char - the minimum length */ \
2, /* Charnc - the minimum length */ \
2, /* not */ \
/* Positive single-char repeats ** These are */ \
2, 2, 2, 2, 2, 2, /* *, *?, +, +?, ?, ?? ** minima in */ \
@@ -483,7 +536,7 @@ in UTF-8 mode. The code that uses this table must know about such things. */
0, /* XCLASS - variable length */ \
3, /* REF */ \
1+LINK_SIZE, /* RECURSE */ \
2, /* CALLOUT */ \
2+2*LINK_SIZE, /* CALLOUT */ \
1+LINK_SIZE, /* Alt */ \
1+LINK_SIZE, /* Ket */ \
1+LINK_SIZE, /* KetRmax */ \
@@ -501,14 +554,6 @@ in UTF-8 mode. The code that uses this table must know about such things. */
1+LINK_SIZE /* BRA */ \
/* The highest extraction number before we have to start using additional
bytes. (Originally PCRE didn't have support for extraction counts highter than
this number.) The value is limited by the number of opcodes left after OP_BRA,
i.e. 255 - OP_BRA. We actually set it a bit lower to leave room for additional
opcodes. */
#define EXTRACT_BASIC_MAX 150
/* A magic value for OP_CREF to indicate the "in recursion" condition. */
#define CREF_RECURSE 0xffff
@@ -554,7 +599,7 @@ just to accommodate the POSIX wrapper. */
#define ERR34 "character value in \\x{...} sequence is too large"
#define ERR35 "invalid condition (?(0)"
#define ERR36 "\\C not allowed in lookbehind assertion"
#define ERR37 "PCRE does not support \\L, \\l, \\N, \\P, \\p, \\U, \\u, or \\X"
#define ERR37 "PCRE does not support \\L, \\l, \\N, \\U, or \\u"
#define ERR38 "number after (?C is > 255"
#define ERR39 "closing ) for (?C expected"
#define ERR40 "recursive call could loop indefinitely"
@@ -562,37 +607,51 @@ just to accommodate the POSIX wrapper. */
#define ERR42 "syntax error after (?P"
#define ERR43 "two named groups have the same name"
#define ERR44 "invalid UTF-8 string"
/* All character handling must be done as unsigned characters. Otherwise there
are problems with top-bit-set characters and functions such as isspace().
However, we leave the interface to the outside world as char *, because that
should make things easier for callers. We define a short type for unsigned char
to save lots of typing. I tried "uchar", but it causes problems on Digital
Unix, where it is defined in sys/types, so use "uschar" instead. */
typedef unsigned char uschar;
#define ERR45 "support for \\P, \\p, and \\X has not been compiled"
#define ERR46 "malformed \\P or \\p sequence"
#define ERR47 "unknown property name after \\P or \\p"
/* The real format of the start of the pcre block; the index of names and the
code vector run on as long as necessary after the end. */
code vector run on as long as necessary after the end. We store an explicit
offset to the name table so that if a regex is compiled on one host, saved, and
then run on another where the size of pointers is different, all might still
be well. For the case of compiled-on-4 and run-on-8, we include an extra
pointer that is always NULL. For future-proofing, we also include a few dummy
fields - even though you can never get this planning right!
NOTE NOTE NOTE:
Because people can now save and re-use compiled patterns, any additions to this
structure should be made at the end, and something earlier (e.g. a new
flag in the options or one of the dummy fields) should indicate that the new
fields are present. Currently PCRE always sets the dummy fields to zero.
NOTE NOTE NOTE:
*/
typedef struct real_pcre {
unsigned long int magic_number;
size_t size; /* Total that was malloced */
const unsigned char *tables; /* Pointer to tables */
unsigned long int options;
unsigned short int top_bracket;
unsigned short int top_backref;
unsigned short int first_byte;
unsigned short int req_byte;
unsigned short int name_entry_size; /* Size of any name items; 0 => none */
unsigned short int name_count; /* Number of name items */
pcre_uint32 magic_number;
pcre_uint32 size; /* Total that was malloced */
pcre_uint32 options;
pcre_uint32 dummy1; /* For future use, maybe */
pcre_uint16 top_bracket;
pcre_uint16 top_backref;
pcre_uint16 first_byte;
pcre_uint16 req_byte;
pcre_uint16 name_table_offset; /* Offset to name table that follows */
pcre_uint16 name_entry_size; /* Size of any name items */
pcre_uint16 name_count; /* Number of name items */
pcre_uint16 dummy2; /* For future use, maybe */
const unsigned char *tables; /* Pointer to tables or NULL for std */
const unsigned char *nullpad; /* NULL padding */
} real_pcre;
/* The format of the block used to store data from pcre_study(). */
/* The format of the block used to store data from pcre_study(). The same
remark (see NOTE above) about extending this structure applies. */
typedef struct pcre_study_data {
size_t size; /* Total that was malloced */
uschar options;
pcre_uint32 size; /* Total that was malloced */
pcre_uint32 options;
uschar start_bits[32];
} pcre_study_data;
@@ -605,12 +664,14 @@ typedef struct compile_data {
const uschar *cbits; /* Points to character type table */
const uschar *ctypes; /* Points to table of type maps */
const uschar *start_code; /* The start of the compiled code */
const uschar *start_pattern; /* The start of the pattern */
uschar *name_table; /* The name/number table */
int names_found; /* Number of entries so far */
int name_entry_size; /* Size of each entry */
int top_backref; /* Maximum back reference */
unsigned int backref_map; /* Bitmap of low back refs */
int req_varyopt; /* "After variable item" flag for reqbyte */
BOOL nopartial; /* Set TRUE if partial won't work */
} compile_data;
/* Structure for maintaining a chain of pointers to the currently incomplete
@@ -660,6 +721,8 @@ typedef struct match_data {
BOOL utf8; /* UTF8 flag */
BOOL endonly; /* Dollar not before final \n */
BOOL notempty; /* Empty string match not wanted */
BOOL partial; /* PARTIAL flag */
BOOL hitend; /* Hit the end of the subject at some point */
const uschar *start_code; /* For use when recursing */
const uschar *start_subject; /* Start of the subject string */
const uschar *end_subject; /* End of the subject string */
+19
View File
@@ -0,0 +1,19 @@
LIBRARY libpcre
EXPORTS
pcre_malloc
pcre_free
pcre_config
pcre_callout
pcre_compile
pcre_copy_substring
pcre_exec
pcre_get_substring
pcre_get_stringnumber
pcre_get_substring_list
pcre_free_substring
pcre_free_substring_list
pcre_info
pcre_fullinfo
pcre_maketables
pcre_study
pcre_version
+24
View File
@@ -0,0 +1,24 @@
LIBRARY libpcreposix
EXPORTS
pcre_malloc
pcre_free
pcre_config
pcre_callout
pcre_compile
pcre_copy_substring
pcre_exec
pcre_get_substring
pcre_get_stringnumber
pcre_get_substring_list
pcre_free_substring
pcre_free_substring_list
pcre_info
pcre_fullinfo
pcre_maketables
pcre_study
pcre_version
regcomp
regexec
regerror
regfree
+22 -16
View File
@@ -8,29 +8,35 @@ and semantics are as close as possible to those of the Perl 5 language.
Written by: Philip Hazel <ph10@cam.ac.uk>
Copyright (c) 1997-2004 University of Cambridge
Copyright (c) 1997-2003 University of Cambridge
-----------------------------------------------------------------------------
Permission is granted to anyone to use this software for any purpose on any
computer system, and to redistribute it freely, subject to the following
restrictions:
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are met:
1. This software is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
* Redistributions of source code must retain the above copyright notice,
this list of conditions and the following disclaimer.
2. The origin of this software must not be misrepresented, either by
explicit claim or by omission.
* Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in the
documentation and/or other materials provided with the distribution.
3. Altered versions must be plainly marked as such, and must not be
misrepresented as being the original software.
* Neither the name of the University of Cambridge nor the names of its
contributors may be used to endorse or promote products derived from
this software without specific prior written permission.
4. If PCRE is embedded in any software that is released under the GNU
General Purpose Licence (GPL), then the terms of that licence shall
supersede any condition above with which it is incompatible.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
POSSIBILITY OF SUCH DAMAGE.
-----------------------------------------------------------------------------
See the file Tech.Notes for some information on the internals.
*/
+1607 -716
View File
File diff suppressed because it is too large Load Diff
+52 -6
View File
@@ -2,7 +2,39 @@
* Perl-Compatible Regular Expressions *
*************************************************/
/* Copyright (c) 1997-2003 University of Cambridge */
/* In its original form, this is the .in file that is transformed by
"configure" into pcre.h.
Copyright (c) 1997-2004 University of Cambridge
-----------------------------------------------------------------------------
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are met:
* Redistributions of source code must retain the above copyright notice,
this list of conditions and the following disclaimer.
* Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in the
documentation and/or other materials provided with the distribution.
* Neither the name of the University of Cambridge nor the names of its
contributors may be used to endorse or promote products derived from
this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
POSSIBILITY OF SUCH DAMAGE.
-----------------------------------------------------------------------------
*/
#ifndef _PCRE_H
#define _PCRE_H
@@ -12,9 +44,9 @@ make changes to pcre.in. */
#include "php_compat.h"
#define PCRE_MAJOR 4
#define PCRE_MINOR 5
#define PCRE_DATE 01-December-2003
#define PCRE_MAJOR 5
#define PCRE_MINOR 0
#define PCRE_DATE 13-Sep-2004
/* Win32 uses DLL by default */
@@ -60,6 +92,8 @@ extern "C" {
#define PCRE_UTF8 0x0800
#define PCRE_NO_AUTO_CAPTURE 0x1000
#define PCRE_NO_UTF8_CHECK 0x2000
#define PCRE_AUTO_CALLOUT 0x4000
#define PCRE_PARTIAL 0x8000
/* Exec-time and get/set-time error codes */
@@ -74,6 +108,10 @@ extern "C" {
#define PCRE_ERROR_CALLOUT (-9) /* Never used by PCRE itself */
#define PCRE_ERROR_BADUTF8 (-10)
#define PCRE_ERROR_BADUTF8_OFFSET (-11)
#define PCRE_ERROR_PARTIAL (-12)
#define PCRE_ERROR_BADPARTIAL (-13)
#define PCRE_ERROR_INTERNAL (-14)
#define PCRE_ERROR_BADCOUNT (-15)
/* Request types for pcre_fullinfo() */
@@ -89,6 +127,7 @@ extern "C" {
#define PCRE_INFO_NAMECOUNT 8
#define PCRE_INFO_NAMETABLE 9
#define PCRE_INFO_STUDYSIZE 10
#define PCRE_INFO_DEFAULT_TABLES 11
/* Request types for pcre_config() */
@@ -98,12 +137,14 @@ extern "C" {
#define PCRE_CONFIG_POSIX_MALLOC_THRESHOLD 3
#define PCRE_CONFIG_MATCH_LIMIT 4
#define PCRE_CONFIG_STACKRECURSE 5
#define PCRE_CONFIG_UNICODE_PROPERTIES 6
/* Bit flags for the pcre_extra structure */
#define PCRE_EXTRA_STUDY_DATA 0x0001
#define PCRE_EXTRA_MATCH_LIMIT 0x0002
#define PCRE_EXTRA_CALLOUT_DATA 0x0004
#define PCRE_EXTRA_TABLES 0x0008
/* Types */
@@ -111,13 +152,15 @@ struct real_pcre; /* declaration; the definition is private */
typedef struct real_pcre pcre;
/* The structure for passing additional data to pcre_exec(). This is defined in
such as way as to be extensible. */
such as way as to be extensible. Always add new fields at the end, in order to
remain compatible. */
typedef struct pcre_extra {
/* Callers pass a pointer to this block to pcre_exec() to supply optional extra
data. New members may only be appended after the last one, so that code built
against an earlier layout keeps working. */
unsigned long int flags; /* Bits for which fields are set; built from the
                            PCRE_EXTRA_* values defined in this header */
void *study_data; /* Opaque data from pcre_study()
                     (flag bit presumably PCRE_EXTRA_STUDY_DATA) */
unsigned long int match_limit; /* Maximum number of calls to match()
                                  (flag bit presumably PCRE_EXTRA_MATCH_LIMIT) */
void *callout_data; /* Data passed back in callouts
                       (flag bit presumably PCRE_EXTRA_CALLOUT_DATA) */
const unsigned char *tables; /* Pointer to character tables
                                (flag bit presumably PCRE_EXTRA_TABLES) */
} pcre_extra;
/* The structure for passing out data via the pcre_callout_function. We use a
@@ -133,10 +176,13 @@ typedef struct pcre_callout_block {
const char *subject; /* The subject being matched */
int subject_length; /* The length of the subject */
int start_match; /* Offset to start of this match attempt */
int current_position; /* Where we currently are */
int current_position; /* Where we currently are in the subject */
int capture_top; /* Max current capture */
int capture_last; /* Most recently closed capture */
void *callout_data; /* Data passed in with the call */
/* ------------------- Added for Version 1 -------------------------- */
int pattern_position; /* Offset to next item in the pattern */
int next_item_length; /* Length of next item in the pattern */
/* ------------------------------------------------------------------ */
} pcre_callout_block;
+324
View File
@@ -0,0 +1,324 @@
/*************************************************
* PCRE DEMONSTRATION PROGRAM *
*************************************************/
/* This is a demonstration program to illustrate the most straightforward ways
of calling the PCRE regular expression library from a C program. See the
pcresample documentation for a short discussion.
Compile thuswise:
gcc -Wall pcredemo.c -I/usr/local/include -L/usr/local/lib \
-R/usr/local/lib -lpcre
Replace "/usr/local/include" and "/usr/local/lib" with wherever the include and
library files for PCRE are installed on your system. Only some operating
systems (e.g. Solaris) use the -R option.
*/
#include <stdio.h>
#include <string.h>
#include <pcre.h>
#define OVECCOUNT 30 /* should be a multiple of 3 */
/* Print every named capturing group together with the text it captured.

Arguments:
  name_table       start of the name table obtained via PCRE_INFO_NAMETABLE
  name_entry_size  size of one table entry (PCRE_INFO_NAMEENTRYSIZE)
  namecount        number of entries in the table (PCRE_INFO_NAMECOUNT)
  subject          the subject string that was matched
  ovector          offset vector filled in by the most recent pcre_exec() call

Each table entry starts with the group number in two bytes, most significant
byte first, followed by the name. A group whose number does not fit into the
OVECCOUNT/3 pairs that ovector can hold is reported as such instead of being
looked up, because indexing ovector with it would read outside the vector. */

static void show_named_substrings(const unsigned char *name_table,
  int name_entry_size, int namecount, const char *subject, const int *ovector)
{
const unsigned char *tabptr = name_table;
int i;

for (i = 0; i < namecount; i++)
  {
  int n = (tabptr[0] << 8) | tabptr[1];
  if (n < OVECCOUNT/3)
    {
    printf("(%d) %*s: %.*s\n", n, name_entry_size - 3, tabptr + 2,
      ovector[2*n+1] - ovector[2*n], subject + ovector[2*n]);
    }
  else
    {
    printf("(%d) %*s: <group number too high for ovector>\n", n,
      name_entry_size - 3, tabptr + 2);
    }
  tabptr += name_entry_size;
  }
}


/* Entry point of the demonstration program.

Usage: pcredemo [-g] <pattern> <subject>

Compiles the pattern, matches it once against the subject, and prints the
numbered and named substrings. With -g it goes on to find all further matches,
in the manner of Perl's /g option.

Returns 0 when matching completed successfully, and 1 for a usage error, a
pattern that fails to compile, a subject that does not match at all, or any
other matching error. */

int main(int argc, char **argv)
{
pcre *re;
const char *error;
char *pattern;
char *subject;
unsigned char *name_table;
int erroffset;
int find_all;
int namecount;
int name_entry_size;
int ovector[OVECCOUNT];
int subject_length;
int rc, i;


/**************************************************************************
* First, sort out the command line. There is only one possible option at  *
* the moment, "-g" to request repeated matching to find all occurrences,  *
* like Perl's /g option. We set the variable find_all to a non-zero value *
* if the -g option is present. Apart from that, there must be exactly two *
* arguments.                                                              *
**************************************************************************/

find_all = 0;
for (i = 1; i < argc; i++)
  {
  if (strcmp(argv[i], "-g") == 0) find_all = 1;
    else break;
  }

/* After the options, we require exactly two arguments, which are the pattern,
and the subject string. */

if (argc - i != 2)
  {
  printf("Two arguments required: a regex and a subject string\n");
  return 1;
  }

pattern = argv[i];
subject = argv[i+1];
subject_length = (int)strlen(subject);


/*************************************************************************
* Now we are going to compile the regular expression pattern, and handle *
* any errors that are detected.                                          *
*************************************************************************/

re = pcre_compile(
  pattern,              /* the pattern */
  0,                    /* default options */
  &error,               /* for error message */
  &erroffset,           /* for error offset */
  NULL);                /* use default character tables */

/* Compilation failed: print the error message and exit */

if (re == NULL)
  {
  printf("PCRE compilation failed at offset %d: %s\n", erroffset, error);
  return 1;
  }


/*************************************************************************
* If the compilation succeeded, we call PCRE again, in order to do a     *
* pattern match against the subject string. This does just ONE match. If *
* further matching is needed, it will be done below.                     *
*************************************************************************/

rc = pcre_exec(
  re,                   /* the compiled pattern */
  NULL,                 /* no extra data - we didn't study the pattern */
  subject,              /* the subject string */
  subject_length,       /* the length of the subject */
  0,                    /* start at offset 0 in the subject */
  0,                    /* default options */
  ovector,              /* output vector for substring information */
  OVECCOUNT);           /* number of elements in the output vector */

/* Matching failed: handle error cases */

if (rc < 0)
  {
  switch(rc)
    {
    case PCRE_ERROR_NOMATCH: printf("No match\n"); break;
    /*
    Handle other special cases if you like
    */
    default: printf("Matching error %d\n", rc); break;
    }
  /* The compiled pattern was obtained through pcre_malloc, so it has to be
  released through the matching pcre_free rather than a bare free(). */
  pcre_free(re);     /* Release memory used for the compiled pattern */
  return 1;
  }

/* Match succeeded */

printf("\nMatch succeeded at offset %d\n", ovector[0]);


/*************************************************************************
* We have found the first match within the subject string. If the output *
* vector wasn't big enough, set its size to the maximum. Then output any *
* substrings that were captured.                                         *
*************************************************************************/

/* The output vector wasn't big enough */

if (rc == 0)
  {
  rc = OVECCOUNT/3;
  printf("ovector only has room for %d captured substrings\n", rc - 1);
  }

/* Show substrings stored in the output vector by number. Obviously, in a real
application you might want to do things other than print them. */

for (i = 0; i < rc; i++)
  {
  char *substring_start = subject + ovector[2*i];
  int substring_length = ovector[2*i+1] - ovector[2*i];
  printf("%2d: %.*s\n", i, substring_length, substring_start);
  }


/**************************************************************************
* That concludes the basic part of this demonstration program. We have    *
* compiled a pattern, and performed a single match. The code that follows *
* first shows how to access named substrings, and then how to code for    *
* repeated matches on the same subject.                                   *
**************************************************************************/

/* See if there are any named substrings, and if so, show them by name. First
we have to extract the count of named parentheses from the pattern. */

(void)pcre_fullinfo(
  re,                   /* the compiled pattern */
  NULL,                 /* no extra data - we didn't study the pattern */
  PCRE_INFO_NAMECOUNT,  /* number of named substrings */
  &namecount);          /* where to put the answer */

if (namecount <= 0) printf("No named substrings\n"); else
  {
  printf("Named substrings\n");

  /* Before we can access the substrings, we must extract the table for
  translating names to numbers, and the size of each entry in the table. */

  (void)pcre_fullinfo(
    re,                       /* the compiled pattern */
    NULL,                     /* no extra data - we didn't study the pattern */
    PCRE_INFO_NAMETABLE,      /* address of the table */
    &name_table);             /* where to put the answer */

  (void)pcre_fullinfo(
    re,                       /* the compiled pattern */
    NULL,                     /* no extra data - we didn't study the pattern */
    PCRE_INFO_NAMEENTRYSIZE,  /* size of each entry in the table */
    &name_entry_size);        /* where to put the answer */

  /* Now we can scan the table and, for each entry, print the number, the name,
  and the substring itself. */

  show_named_substrings(name_table, name_entry_size, namecount, subject,
    ovector);
  }


/*************************************************************************
* If the "-g" option was given on the command line, we want to continue  *
* to search for additional matches in the subject string, in a similar   *
* way to the /g option in Perl. This turns out to be trickier than you   *
* might think because of the possibility of matching an empty string.    *
* What happens is as follows:                                            *
*                                                                        *
* If the previous match was NOT for an empty string, we can just start   *
* the next match at the end of the previous one.                         *
*                                                                        *
* If the previous match WAS for an empty string, we can't do that, as it *
* would lead to an infinite loop. Instead, a special call of pcre_exec() *
* is made with the PCRE_NOTEMPTY and PCRE_ANCHORED flags set. The first  *
* of these tells PCRE that an empty string is not a valid match; other   *
* possibilities must be tried. The second flag restricts PCRE to one     *
* match attempt at the initial string position. If this match succeeds,  *
* an alternative to the empty string match has been found, and we can    *
* proceed round the loop.                                                *
*************************************************************************/

if (!find_all)
  {
  pcre_free(re);   /* Release the memory used for the compiled pattern */
  return 0;        /* Finish unless -g was given */
  }

/* Loop for second and subsequent matches */

for (;;)
  {
  int options = 0;                 /* Normally no options */
  int start_offset = ovector[1];   /* Start at end of previous match */

  /* If the previous match was for an empty string, we are finished if we are
  at the end of the subject. Otherwise, arrange to run another match at the
  same point to see if a non-empty match can be found. */

  if (ovector[0] == ovector[1])
    {
    if (ovector[0] == subject_length) break;
    options = PCRE_NOTEMPTY | PCRE_ANCHORED;
    }

  /* Run the next matching operation */

  rc = pcre_exec(
    re,                   /* the compiled pattern */
    NULL,                 /* no extra data - we didn't study the pattern */
    subject,              /* the subject string */
    subject_length,       /* the length of the subject */
    start_offset,         /* starting offset in the subject */
    options,              /* options */
    ovector,              /* output vector for substring information */
    OVECCOUNT);           /* number of elements in the output vector */

  /* This time, a result of NOMATCH isn't an error. If the value in "options"
  is zero, it just means we have found all possible matches, so the loop ends.
  Otherwise, it means we have failed to find a non-empty-string match at a
  point where there was a previous empty-string match. In this case, we do what
  Perl does: advance the matching position by one, and continue. We do this by
  setting the "end of previous match" offset, because that is picked up at the
  top of the loop as the point at which to start again. */

  if (rc == PCRE_ERROR_NOMATCH)
    {
    if (options == 0) break;
    ovector[1] = start_offset + 1;
    continue;    /* Go round the loop again */
    }

  /* Other matching errors are not recoverable. */

  if (rc < 0)
    {
    printf("Matching error %d\n", rc);
    pcre_free(re);    /* Release memory used for the compiled pattern */
    return 1;
    }

  /* Match succeeded */

  printf("\nMatch succeeded again at offset %d\n", ovector[0]);

  /* The match succeeded, but the output vector wasn't big enough. */

  if (rc == 0)
    {
    rc = OVECCOUNT/3;
    printf("ovector only has room for %d captured substrings\n", rc - 1);
    }

  /* As before, show substrings stored in the output vector by number, and then
  also any named substrings. */

  for (i = 0; i < rc; i++)
    {
    char *substring_start = subject + ovector[2*i];
    int substring_length = ovector[2*i+1] - ovector[2*i];
    printf("%2d: %.*s\n", i, substring_length, substring_start);
    }

  if (namecount <= 0) printf("No named substrings\n"); else
    {
    printf("Named substrings\n");
    show_named_substrings(name_table, name_entry_size, namecount, subject,
      ovector);
    }
  }      /* End of loop to find second and subsequent matches */

printf("\n");
pcre_free(re);       /* Release memory used for the compiled pattern */
return 0;
}
/* End of pcredemo.c */
+32 -1
View File
@@ -4,7 +4,38 @@
/* This is a grep program that uses the PCRE regular expression library to do
its pattern matching. On a Unix or Win32 system it can recurse into
directories. */
directories.
Copyright (c) 1997-2004 University of Cambridge
-----------------------------------------------------------------------------
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are met:
* Redistributions of source code must retain the above copyright notice,
this list of conditions and the following disclaimer.
* Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in the
documentation and/or other materials provided with the distribution.
* Neither the name of the University of Cambridge nor the names of its
contributors may be used to endorse or promote products derived from
this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
POSSIBILITY OF SUCH DAMAGE.
-----------------------------------------------------------------------------
*/
#include <ctype.h>
#include <stdio.h>
+27 -16
View File
@@ -15,23 +15,31 @@ Written by: Philip Hazel <ph10@cam.ac.uk>
Copyright (c) 1997-2004 University of Cambridge
-----------------------------------------------------------------------------
Permission is granted to anyone to use this software for any purpose on any
computer system, and to redistribute it freely, subject to the following
restrictions:
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are met:
1. This software is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
* Redistributions of source code must retain the above copyright notice,
this list of conditions and the following disclaimer.
2. The origin of this software must not be misrepresented, either by
explicit claim or by omission.
* Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in the
documentation and/or other materials provided with the distribution.
3. Altered versions must be plainly marked as such, and must not be
misrepresented as being the original software.
* Neither the name of the University of Cambridge nor the names of its
contributors may be used to endorse or promote products derived from
this software without specific prior written permission.
4. If PCRE is embedded in any software that is released under the GNU
General Purpose Licence (GPL), then the terms of that licence shall
supersede any condition above with which it is incompatible.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
POSSIBILITY OF SUCH DAMAGE.
-----------------------------------------------------------------------------
*/
@@ -48,7 +56,7 @@ static const char *const estring[] = {
ERR11, ERR12, ERR13, ERR14, ERR15, ERR16, ERR17, ERR18, ERR19, ERR20,
ERR21, ERR22, ERR23, ERR24, ERR25, ERR26, ERR27, ERR29, ERR29, ERR30,
ERR31, ERR32, ERR33, ERR34, ERR35, ERR36, ERR37, ERR38, ERR39, ERR40,
ERR41, ERR42, ERR43, ERR44 };
ERR41, ERR42, ERR43, ERR44, ERR45, ERR46, ERR47 };
static const int eint[] = {
REG_EESCAPE, /* "\\ at end of pattern" */
@@ -87,14 +95,17 @@ static const int eint[] = {
REG_BADPAT, /* "character value in \x{...} sequence is too large" */
REG_BADPAT, /* "invalid condition (?(0)" */
REG_BADPAT, /* "\\C not allowed in lookbehind assertion" */
REG_EESCAPE, /* "PCRE does not support \\L, \\l, \\N, \\P, \\p, \\U, \\u, or \\X" */
REG_EESCAPE, /* "PCRE does not support \\L, \\l, \\N, \\U, or \\u" */
REG_BADPAT, /* "number after (?C is > 255" */
REG_BADPAT, /* "closing ) for (?C expected" */
REG_BADPAT, /* "recursive call could loop indefinitely" */
REG_BADPAT, /* "unrecognized character after (?P" */
REG_BADPAT, /* "syntax error after (?P" */
REG_BADPAT, /* "two named groups have the same name" */
REG_BADPAT /* "invalid UTF-8 string" */
REG_BADPAT, /* "invalid UTF-8 string" */
REG_BADPAT, /* "support for \\P, \\p, and \\X has not been compiled" */
REG_BADPAT, /* "malformed \\P or \\p sequence" */
REG_BADPAT /* "unknown property name after \\P or \\p" */
};
/* Table of texts corresponding to POSIX error codes */
+32 -3
View File
@@ -2,14 +2,43 @@
* Perl-Compatible Regular Expressions *
*************************************************/
/* Copyright (c) 1997-2003 University of Cambridge */
#ifndef _PCREPOSIX_H
#define _PCREPOSIX_H
/* This is the header for the POSIX wrapper interface to the PCRE Perl-
Compatible Regular Expression library. It defines the things POSIX says should
be there. I hope. */
be there. I hope.
Copyright (c) 1997-2004 University of Cambridge
-----------------------------------------------------------------------------
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are met:
* Redistributions of source code must retain the above copyright notice,
this list of conditions and the following disclaimer.
* Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in the
documentation and/or other materials provided with the distribution.
* Neither the name of the University of Cambridge nor the names of its
contributors may be used to endorse or promote products derived from
this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
POSSIBILITY OF SUCH DAMAGE.
-----------------------------------------------------------------------------
*/
/* Have to include stdlib.h in order to ensure that size_t is defined. */
+380 -77
View File
@@ -4,7 +4,37 @@
/* This program was hacked up as a tester for PCRE. I really should have
written it more tidily in the first place. Will I ever learn? It has grown and
been extended and consequently is now rather untidy in places. */
been extended and consequently is now rather untidy in places.
-----------------------------------------------------------------------------
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are met:
* Redistributions of source code must retain the above copyright notice,
this list of conditions and the following disclaimer.
* Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in the
documentation and/or other materials provided with the distribution.
* Neither the name of the University of Cambridge nor the names of its
contributors may be used to endorse or promote products derived from
this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
POSSIBILITY OF SUCH DAMAGE.
-----------------------------------------------------------------------------
*/
#include <ctype.h>
#include <stdio.h>
@@ -12,6 +42,7 @@ been extended and consequently is now rather untidy in places. */
#include <stdlib.h>
#include <time.h>
#include <locale.h>
#include <errno.h>
/* We need the internal info for displaying the results of pcre_study(). Also
for getting the opcodes for showing compiled code. */
@@ -35,9 +66,10 @@ Makefile. */
#endif
#endif
#define LOOPREPEAT 50000
#define LOOPREPEAT 500000
#define BUFFER_SIZE 30000
#define PBUFFER_SIZE BUFFER_SIZE
#define DBUFFER_SIZE BUFFER_SIZE
@@ -52,6 +84,8 @@ static int show_malloc;
static int use_utf8;
static size_t gotten_store;
static uschar *pbuffer = NULL;
static const int utf8_table1[] = {
0x0000007f, 0x000007ff, 0x0000ffff, 0x001fffff, 0x03ffffff, 0x7fffffff};
@@ -71,10 +105,13 @@ static const int utf8_table3[] = {
/* The code for doing this is held in a separate file that is also included in
pcre.c when it is compiled with the debug switch. It defines a function called
print_internals(), which uses a table of opcode lengths defined by the macro
OP_LENGTHS, whose name must be OP_lengths. */
OP_LENGTHS, whose name must be OP_lengths. It also uses a table that translates
Unicode property names to numbers; this is kept in a separate file. */
static uschar OP_lengths[] = { OP_LENGTHS };
#include "ucp.h"
#include "ucptypetable.c"
#include "printint.c"
@@ -269,7 +306,7 @@ data is not zero. */
static int callout(pcre_callout_block *cb)
{
FILE *f = (first_callout | callout_extra)? outfile : NULL;
int i, pre_start, post_start;
int i, pre_start, post_start, subject_length;
if (callout_extra)
{
@@ -300,16 +337,26 @@ pre_start = pchars((unsigned char *)cb->subject, cb->start_match, f);
post_start = pchars((unsigned char *)(cb->subject + cb->start_match),
cb->current_position - cb->start_match, f);
subject_length = pchars((unsigned char *)cb->subject, cb->subject_length, NULL);
(void)pchars((unsigned char *)(cb->subject + cb->current_position),
cb->subject_length - cb->current_position, f);
if (f != NULL) fprintf(f, "\n");
/* Always print appropriate indicators, with callout number if not already
shown */
shown. For automatic callouts, show the pattern offset. */
if (callout_extra) fprintf(outfile, " ");
else fprintf(outfile, "%3d ", cb->callout_number);
if (cb->callout_number == 255)
{
fprintf(outfile, "%+3d ", cb->pattern_position);
if (cb->pattern_position > 99) fprintf(outfile, "\n ");
}
else
{
if (callout_extra) fprintf(outfile, " ");
else fprintf(outfile, "%3d ", cb->callout_number);
}
for (i = 0; i < pre_start; i++) fprintf(outfile, " ");
fprintf(outfile, "^");
@@ -320,6 +367,12 @@ if (post_start > 0)
fprintf(outfile, "^");
}
for (i = 0; i < subject_length - pre_start - post_start + 4; i++)
fprintf(outfile, " ");
fprintf(outfile, "%.*s", (cb->next_item_length == 0)? 1 : cb->next_item_length,
pbuffer + cb->pattern_position);
fprintf(outfile, "\n");
first_callout = 0;
@@ -395,6 +448,23 @@ if ((rc = pcre_fullinfo(re, study, option, ptr)) < 0)
/*************************************************
* Byte flipping function *
*************************************************/
/* Reverse the byte order of an integer field taken from a compiled pattern.

Arguments:
  value     the field value to be flipped
  n         the size of the field in bytes; 2 selects a 16-bit swap, and any
              other value is handled as a 32-bit swap

Returns:    the value with its low 2 (or 4) bytes reversed; any higher-order
              bytes of the input are discarded

The shifting is done on an unsigned copy of the value. Moving a byte into the
sign bit of a signed long is undefined behaviour when long is 32 bits wide. */

static long int
byteflip(long int value, int n)
{
unsigned long int v = (unsigned long int)value;

if (n == 2)
  return (long int)(((v & 0x00ffUL) << 8) | ((v & 0xff00UL) >> 8));

return (long int)(((v & 0x000000ffUL) << 24) |
                  ((v & 0x0000ff00UL) <<  8) |
                  ((v & 0x00ff0000UL) >>  8) |
                  ((v & 0xff000000UL) >> 24));
}
/*************************************************
* Main Program *
*************************************************/
@@ -429,8 +499,15 @@ when I am debugging. */
buffer = (unsigned char *)malloc(BUFFER_SIZE);
dbuffer = (unsigned char *)malloc(DBUFFER_SIZE);
pbuffer = (unsigned char *)malloc(PBUFFER_SIZE);
/* Static so that new_malloc can use it. */
/* The outfile variable is static so that new_malloc can use it. The _setmode()
stuff is some magic that I don't understand, but which apparently does good
things in Windows. It's related to line terminations. */
#if defined(_WIN32) || defined(WIN32)
_setmode( _fileno( stdout ), 0x8000 );
#endif /* defined(_WIN32) || defined(WIN32) */
outfile = stdout;
@@ -462,6 +539,8 @@ while (argc > 1 && argv[op][0] == '-')
printf("Compiled with\n");
(void)pcre_config(PCRE_CONFIG_UTF8, &rc);
printf(" %sUTF-8 support\n", rc? "" : "No ");
(void)pcre_config(PCRE_CONFIG_UNICODE_PROPERTIES, &rc);
printf(" %sUnicode properties support\n", rc? "" : "No ");
(void)pcre_config(PCRE_CONFIG_NEWLINE, &rc);
printf(" Newline character is %s\n", (rc == '\r')? "CR" : "LF");
(void)pcre_config(PCRE_CONFIG_LINK_SIZE, &rc);
@@ -481,11 +560,12 @@ while (argc > 1 && argv[op][0] == '-')
printf(" -C show PCRE compile-time options and exit\n");
printf(" -d debug: show compiled code; implies -i\n"
" -i show information about compiled pattern\n"
" -m output memory used information\n"
" -o <n> set size of offsets vector to <n>\n");
#if !defined NOPOSIX
printf(" -p use POSIX interface\n");
#endif
printf(" -s output store information\n"
printf(" -s output store (memory) used information\n"
" -t time compilation and execution\n");
return 1;
}
@@ -508,7 +588,7 @@ if (offsets == NULL)
if (argc > 1)
{
infile = fopen(argv[op], "r");
infile = fopen(argv[op], "rb");
if (infile == NULL)
{
printf("** Failed to open %s\n", argv[op]);
@@ -518,7 +598,7 @@ if (argc > 1)
if (argc > 2)
{
outfile = fopen(argv[op+1], "w");
outfile = fopen(argv[op+1], "wb");
if (outfile == NULL)
{
printf("** Failed to open %s\n", argv[op+1]);
@@ -551,13 +631,17 @@ while (!done)
const char *error;
unsigned char *p, *pp, *ppp;
unsigned char *to_file = NULL;
const unsigned char *tables = NULL;
unsigned long int true_size, true_study_size = 0;
size_t size, regex_gotten_store;
int do_study = 0;
int do_debug = debug;
int do_G = 0;
int do_g = 0;
int do_showinfo = showinfo;
int do_showrest = 0;
int do_flip = 0;
int erroroffset, len, delimiter;
use_utf8 = 0;
@@ -571,8 +655,93 @@ while (!done)
while (isspace(*p)) p++;
if (*p == 0) continue;
/* Get the delimiter and seek the end of the pattern; if is isn't
complete, read more. */
/* See if the pattern is to be loaded pre-compiled from a file. */
if (*p == '<' && strchr((char *)(p+1), '<') == NULL)
{
unsigned long int magic;
uschar sbuf[8];
FILE *f;
p++;
pp = p + (int)strlen((char *)p);
while (isspace(pp[-1])) pp--;
*pp = 0;
f = fopen((char *)p, "rb");
if (f == NULL)
{
fprintf(outfile, "Failed to open %s: %s\n", p, strerror(errno));
continue;
}
if (fread(sbuf, 1, 8, f) != 8) goto FAIL_READ;
true_size =
(sbuf[0] << 24) | (sbuf[1] << 16) | (sbuf[2] << 8) | sbuf[3];
true_study_size =
(sbuf[4] << 24) | (sbuf[5] << 16) | (sbuf[6] << 8) | sbuf[7];
re = (real_pcre *)new_malloc(true_size);
regex_gotten_store = gotten_store;
if (fread(re, 1, true_size, f) != true_size) goto FAIL_READ;
magic = ((real_pcre *)re)->magic_number;
if (magic != MAGIC_NUMBER)
{
if (byteflip(magic, sizeof(magic)) == MAGIC_NUMBER)
{
do_flip = 1;
}
else
{
fprintf(outfile, "Data in %s is not a compiled PCRE regex\n", p);
fclose(f);
continue;
}
}
fprintf(outfile, "Compiled regex%s loaded from %s\n",
do_flip? " (byte-inverted)" : "", p);
/* Need to know if UTF-8 for printing data strings */
new_info(re, NULL, PCRE_INFO_OPTIONS, &options);
use_utf8 = (options & PCRE_UTF8) != 0;
/* Now see if there is any following study data */
if (true_study_size != 0)
{
pcre_study_data *psd;
extra = (pcre_extra *)new_malloc(sizeof(pcre_extra) + true_study_size);
extra->flags = PCRE_EXTRA_STUDY_DATA;
psd = (pcre_study_data *)(((char *)extra) + sizeof(pcre_extra));
extra->study_data = psd;
if (fread(psd, 1, true_study_size, f) != true_study_size)
{
FAIL_READ:
fprintf(outfile, "Failed to read data from %s\n", p);
if (extra != NULL) new_free(extra);
if (re != NULL) new_free(re);
fclose(f);
continue;
}
fprintf(outfile, "Study data loaded from %s\n", p);
do_study = 1; /* To get the data output if requested */
}
else fprintf(outfile, "No study data\n");
fclose(f);
goto SHOW_INFO;
}
/* In-line pattern (the usual case). Get the delimiter and seek the end of
the pattern; if it isn't complete, read more. */
delimiter = *p++;
@@ -617,9 +786,11 @@ while (!done)
if (pp[1] == '\\') *pp++ = '\\';
/* Terminate the pattern at the delimiter */
/* Terminate the pattern at the delimiter, and save a copy of the pattern
for callouts. */
*pp++ = 0;
strcpy((char *)pbuffer, (char *)p);
/* Look for options after final delimiter */
@@ -639,8 +810,10 @@ while (!done)
case '+': do_showrest = 1; break;
case 'A': options |= PCRE_ANCHORED; break;
case 'C': options |= PCRE_AUTO_CALLOUT; break;
case 'D': do_debug = do_showinfo = 1; break;
case 'E': options |= PCRE_DOLLAR_ENDONLY; break;
case 'F': do_flip = 1; break;
case 'G': do_G = 1; break;
case 'I': do_showinfo = 1; break;
case 'M': log_store = 1; break;
@@ -669,7 +842,15 @@ while (!done)
pp = ppp;
break;
case '>':
to_file = pp;
while (*pp != 0) pp++;
while (isspace(pp[-1])) pp--;
*pp = 0;
break;
case '\n': case ' ': break;
default:
fprintf(outfile, "** Unknown option '%c'\n", pp[-1]);
goto SKIP_DATA;
@@ -685,6 +866,7 @@ while (!done)
{
int rc;
int cflags = 0;
if ((options & PCRE_CASELESS) != 0) cflags |= REG_ICASE;
if ((options & PCRE_MULTILINE) != 0) cflags |= REG_NEWLINE;
rc = regcomp(&preg, (char *)p, cflags);
@@ -759,14 +941,77 @@ while (!done)
sizeof(real_pcre) -
((real_pcre *)re)->name_count * ((real_pcre *)re)->name_entry_size));
/* Extract the size for possible writing before possibly flipping it,
and remember the store that was got. */
true_size = ((real_pcre *)re)->size;
regex_gotten_store = gotten_store;
/* If /S was present, study the regexp to generate additional info to
help with the matching. */
if (do_study)
{
if (timeit)
{
register int i;
clock_t time_taken;
clock_t start_time = clock();
for (i = 0; i < LOOPREPEAT; i++)
extra = pcre_study(re, study_options, &error);
time_taken = clock() - start_time;
if (extra != NULL) free(extra);
fprintf(outfile, " Study time %.3f milliseconds\n",
(((double)time_taken * 1000.0) / (double)LOOPREPEAT) /
(double)CLOCKS_PER_SEC);
}
extra = pcre_study(re, study_options, &error);
if (error != NULL)
fprintf(outfile, "Failed to study: %s\n", error);
else if (extra != NULL)
true_study_size = ((pcre_study_data *)(extra->study_data))->size;
}
/* If the 'F' option was present, we flip the bytes of all the integer
fields in the regex data block and the study block. This is to make it
possible to test PCRE's handling of byte-flipped patterns, e.g. those
compiled on a different architecture. */
if (do_flip)
{
real_pcre *rre = (real_pcre *)re;
rre->magic_number = byteflip(rre->magic_number, sizeof(rre->magic_number));
rre->size = byteflip(rre->size, sizeof(rre->size));
rre->options = byteflip(rre->options, sizeof(rre->options));
rre->top_bracket = byteflip(rre->top_bracket, sizeof(rre->top_bracket));
rre->top_backref = byteflip(rre->top_backref, sizeof(rre->top_backref));
rre->first_byte = byteflip(rre->first_byte, sizeof(rre->first_byte));
rre->req_byte = byteflip(rre->req_byte, sizeof(rre->req_byte));
rre->name_table_offset = byteflip(rre->name_table_offset,
sizeof(rre->name_table_offset));
rre->name_entry_size = byteflip(rre->name_entry_size,
sizeof(rre->name_entry_size));
rre->name_count = byteflip(rre->name_count, sizeof(rre->name_count));
if (extra != NULL)
{
pcre_study_data *rsd = (pcre_study_data *)(extra->study_data);
rsd->size = byteflip(rsd->size, sizeof(rsd->size));
rsd->options = byteflip(rsd->options, sizeof(rsd->options));
}
}
/* Extract information from the compiled data if required */
SHOW_INFO:
if (do_showinfo)
{
unsigned long int get_options;
unsigned long int get_options, all_options;
int old_first_char, old_options, old_count;
int count, backrefmax, first_char, need_char;
int nameentrysize, namecount;
const uschar *nametable;
size_t size;
if (do_debug)
{
@@ -802,9 +1047,9 @@ while (!done)
get_options, old_options);
}
if (size != gotten_store) fprintf(outfile,
if (size != regex_gotten_store) fprintf(outfile,
"Size disagreement: pcre_fullinfo=%d call to malloc for %d\n",
size, gotten_store);
size, regex_gotten_store);
fprintf(outfile, "Capturing subpattern count = %d\n", count);
if (backrefmax > 0)
@@ -822,6 +1067,18 @@ while (!done)
}
}
/* The NOPARTIAL bit is a private bit in the options, so we have
to fish it out via a back door */
all_options = ((real_pcre *)re)->options;
if (do_flip)
{
all_options = byteflip(all_options, sizeof(all_options));
}
if ((all_options & PCRE_NOPARTIAL) != 0)
fprintf(outfile, "Partial matching not supported\n");
if (get_options == 0) fprintf(outfile, "No options\n");
else fprintf(outfile, "Options:%s%s%s%s%s%s%s%s%s%s\n",
((get_options & PCRE_ANCHORED) != 0)? " anchored" : "",
@@ -871,77 +1128,103 @@ while (!done)
else
fprintf(outfile, "Need char = %d%s\n", ch, caseless);
}
}
/* If /S was present, study the regexp to generate additional info to
help with the matching. */
if (do_study)
{
if (timeit)
{
register int i;
clock_t time_taken;
clock_t start_time = clock();
for (i = 0; i < LOOPREPEAT; i++)
extra = pcre_study(re, study_options, &error);
time_taken = clock() - start_time;
if (extra != NULL) free(extra);
fprintf(outfile, " Study time %.3f milliseconds\n",
(((double)time_taken * 1000.0) / (double)LOOPREPEAT) /
(double)CLOCKS_PER_SEC);
}
extra = pcre_study(re, study_options, &error);
if (error != NULL)
fprintf(outfile, "Failed to study: %s\n", error);
else if (extra == NULL)
fprintf(outfile, "Study returned NULL\n");
/* Don't output study size; at present it is in any case a fixed
value, but it varies, depending on the computer architecture, and
so messes up the test suite. */
so messes up the test suite. (And with the /F option, it might be
flipped.) */
else if (do_showinfo)
if (do_study)
{
size_t size;
uschar *start_bits = NULL;
new_info(re, extra, PCRE_INFO_STUDYSIZE, &size);
new_info(re, extra, PCRE_INFO_FIRSTTABLE, &start_bits);
/* fprintf(outfile, "Study size = %d\n", size); */
if (start_bits == NULL)
fprintf(outfile, "No starting character set\n");
if (extra == NULL)
fprintf(outfile, "Study returned NULL\n");
else
{
int i;
int c = 24;
fprintf(outfile, "Starting character set: ");
for (i = 0; i < 256; i++)
uschar *start_bits = NULL;
new_info(re, extra, PCRE_INFO_FIRSTTABLE, &start_bits);
if (start_bits == NULL)
fprintf(outfile, "No starting byte set\n");
else
{
if ((start_bits[i/8] & (1<<(i%8))) != 0)
int i;
int c = 24;
fprintf(outfile, "Starting byte set: ");
for (i = 0; i < 256; i++)
{
if (c > 75)
if ((start_bits[i/8] & (1<<(i&7))) != 0)
{
fprintf(outfile, "\n ");
c = 2;
}
if (isprint(i) && i != ' ')
{
fprintf(outfile, "%c ", i);
c += 2;
}
else
{
fprintf(outfile, "\\x%02x ", i);
c += 5;
if (c > 75)
{
fprintf(outfile, "\n ");
c = 2;
}
if (isprint(i) && i != ' ')
{
fprintf(outfile, "%c ", i);
c += 2;
}
else
{
fprintf(outfile, "\\x%02x ", i);
c += 5;
}
}
}
fprintf(outfile, "\n");
}
fprintf(outfile, "\n");
}
}
}
}
/* If the '>' option was present, we write out the regex to a file, and
that is all. The first 8 bytes of the file are the regex length and then
the study length, in big-endian order. */
if (to_file != NULL)
{
FILE *f = fopen((char *)to_file, "wb");
if (f == NULL)
{
fprintf(outfile, "Unable to open %s: %s\n", to_file, strerror(errno));
}
else
{
uschar sbuf[8];
sbuf[0] = (true_size >> 24) & 255;
sbuf[1] = (true_size >> 16) & 255;
sbuf[2] = (true_size >> 8) & 255;
sbuf[3] = (true_size) & 255;
sbuf[4] = (true_study_size >> 24) & 255;
sbuf[5] = (true_study_size >> 16) & 255;
sbuf[6] = (true_study_size >> 8) & 255;
sbuf[7] = (true_study_size) & 255;
if (fwrite(sbuf, 1, 8, f) < 8 ||
fwrite(re, 1, true_size, f) < true_size)
{
fprintf(outfile, "Write error on %s: %s\n", to_file, strerror(errno));
}
else
{
fprintf(outfile, "Compiled regex written to %s\n", to_file);
if (extra != NULL)
{
if (fwrite(extra->study_data, 1, true_study_size, f) <
true_study_size)
{
fprintf(outfile, "Write error on %s: %s\n", to_file,
strerror(errno));
}
else fprintf(outfile, "Study data written to %s\n", to_file);
}
}
fclose(f);
}
continue; /* With next regex */
}
} /* End of non-POSIX compile */
/* Read data lines and test them */
@@ -1045,10 +1328,14 @@ while (!done)
}
break;
case 0: /* Allows for an empty line */
case 0: /* \ followed by EOF allows for an empty line */
p--;
continue;
case '>':
while(isdigit(*p)) start_offset = start_offset * 10 + *p++ - '0';
continue;
case 'A': /* Option setting */
options |= PCRE_ANCHORED;
continue;
@@ -1159,6 +1446,10 @@ while (!done)
if (n == 0) use_offsets = NULL; /* Ensures it can't write to it */
continue;
case 'P':
options |= PCRE_PARTIAL;
continue;
case 'S':
show_malloc = 1;
continue;
@@ -1269,7 +1560,8 @@ while (!done)
min = mid;
mid = (mid == max - 1)? max : (max > 0)? (min + max)/2 : mid*2;
}
else if (count >= 0 || count == PCRE_ERROR_NOMATCH)
else if (count >= 0 || count == PCRE_ERROR_NOMATCH ||
count == PCRE_ERROR_PARTIAL)
{
if (mid == min + 1)
{
@@ -1305,8 +1597,11 @@ while (!done)
/* The normal case is just to do the match once, with the default
value of match_limit. */
else count = pcre_exec(re, extra, (char *)bptr, len,
start_offset, options | g_notempty, use_offsets, use_size_offsets);
else
{
count = pcre_exec(re, extra, (char *)bptr, len,
start_offset, options | g_notempty, use_offsets, use_size_offsets);
}
if (count == 0)
{
@@ -1393,6 +1688,14 @@ while (!done)
}
}
/* There was a partial match */
else if (count == PCRE_ERROR_PARTIAL)
{
fprintf(outfile, "Partial match\n");
break; /* Out of the /g loop */
}
/* Failed to match. If this is a /g or /G loop and we previously set
g_notempty after a null match, this is not necessarily the end.
We want to advance the start offset, and continue. In the case of UTF-8
+37 -25
View File
@@ -12,23 +12,31 @@ Written by: Philip Hazel <ph10@cam.ac.uk>
Copyright (c) 1997-2004 University of Cambridge
-----------------------------------------------------------------------------
Permission is granted to anyone to use this software for any purpose on any
computer system, and to redistribute it freely, subject to the following
restrictions:
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are met:
1. This software is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
* Redistributions of source code must retain the above copyright notice,
this list of conditions and the following disclaimer.
2. The origin of this software must not be misrepresented, either by
explicit claim or by omission.
* Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in the
documentation and/or other materials provided with the distribution.
3. Altered versions must be plainly marked as such, and must not be
misrepresented as being the original software.
* Neither the name of the University of Cambridge nor the names of its
contributors may be used to endorse or promote products derived from
this software without specific prior written permission.
4. If PCRE is embedded in any software that is released under the GNU
General Purpose Licence (GPL), then the terms of that licence shall
supersede any condition above with which it is incompatible.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
POSSIBILITY OF SUCH DAMAGE.
-----------------------------------------------------------------------------
*/
@@ -57,7 +65,7 @@ Returns: nothing
*/
static void
set_bit(uschar *start_bits, int c, BOOL caseless, compile_data *cd)
set_bit(uschar *start_bits, unsigned int c, BOOL caseless, compile_data *cd)
{
start_bits[c/8] |= (1 << (c&7));
if (caseless && (cd->ctypes[c] & ctype_letter) != 0)
@@ -123,7 +131,7 @@ do
/* Skip over callout */
case OP_CALLOUT:
tcode += 2;
tcode += 2 + 2*LINK_SIZE;
break;
/* Skip over extended extraction bracket number */
@@ -186,11 +194,10 @@ do
/* At least one single char sets the bit and stops */
case OP_EXACT: /* Fall through */
tcode++;
case OP_CHARS: /* Fall through */
tcode++;
tcode += 2;
case OP_CHAR:
case OP_CHARNC:
case OP_PLUS:
case OP_MINPLUS:
set_bit(start_bits, tcode[1], caseless, cd);
@@ -403,8 +410,9 @@ pcre_study(const pcre *external_re, int options, const char **errorptr)
uschar start_bits[32];
pcre_extra *extra;
pcre_study_data *study;
const uschar *tables;
const real_pcre *re = (const real_pcre *)external_re;
uschar *code = (uschar *)re + sizeof(real_pcre) +
uschar *code = (uschar *)re + re->name_table_offset +
(re->name_count * re->name_entry_size);
compile_data compile_block;
@@ -429,12 +437,16 @@ at present. */
if ((re->options & (PCRE_ANCHORED|PCRE_FIRSTSET|PCRE_STARTLINE)) != 0)
return NULL;
/* Set the character tables in the block which is passed around */
/* Set the character tables in the block that is passed around */
compile_block.lcc = re->tables + lcc_offset;
compile_block.fcc = re->tables + fcc_offset;
compile_block.cbits = re->tables + cbits_offset;
compile_block.ctypes = re->tables + ctypes_offset;
tables = re->tables;
if (tables == NULL)
(void)pcre_fullinfo(external_re, NULL, PCRE_INFO_DEFAULT_TABLES, &tables);
compile_block.lcc = tables + lcc_offset;
compile_block.fcc = tables + fcc_offset;
compile_block.cbits = tables + cbits_offset;
compile_block.ctypes = tables + ctypes_offset;
/* See if we can find a fixed set of initial characters for the pattern. */
+151
View File
@@ -0,0 +1,151 @@
/*************************************************
* libucp - Unicode Property Table handler *
*************************************************/
/* This function provides a fast way of obtaining the basic Unicode properties
of a character, using a compact binary tree that occupies less than 100K bytes.
Copyright (c) 2004 University of Cambridge
-------------------------------------------------------------------------------
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are met:
* Redistributions of source code must retain the above copyright notice,
this list of conditions and the following disclaimer.
* Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in the
documentation and/or other materials provided with the distribution.
* Neither the name of the University of Cambridge nor the names of its
contributors may be used to endorse or promote products derived from
this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
POSSIBILITY OF SUCH DAMAGE.
-------------------------------------------------------------------------------
*/
#include "ucp.h" /* Exported interface */
#include "ucpinternal.h" /* Internal table details */
#include "ucptable.c" /* The table itself */
/*************************************************
* Search table and return data *
*************************************************/
/* Look up a character in the Unicode property tree (ucp_table).

Two values are returned: the function result is the general category (ucp_C,
ucp_L, etc.), and the detailed character type (ucp_Lu, ucp_Nd, etc.) is stored
through type_ptr.

Arguments:
  c           the character value to look up
  type_ptr    the detailed character type is returned here; it is written
                only when the character is found in the tree
  case_ptr    for letters (category ucp_L), the opposite case is returned
                here, if there is one, else zero; it is not written for any
                other category

Returns:      the character type category, or -1 if the character is not in
                the tree (or its stored type is not a known ucp_xx value)
*/

static int
ucp_findchar(const int c, int *type_ptr, int *case_ptr)
{
  const cnode *node = ucp_table;
  register int cc = c;
  int case_offset;

  /* Descend the tree from the root. A left child, when present, is always the
  next node in the vector; a right child is 2^(roffset-1) nodes further on. */

  for (;;)
    {
    /* Rebuild the node's 24-bit code point from f1 (low 16 bits) and the
    bottom byte of f0 (high 8 bits). */

    register int d = node->f1 | ((node->f0 & f0_chhmask) << 16);

    if (cc == d) break;

    if (cc < d)
      {
      if ((node->f0 & f0_leftexists) == 0) return -1;
      node++;
      }
    else
      {
      register int roffset = (node->f2 & f2_rightmask) >> f2_rightshift;
      if (roffset == 0) return -1;
      node += 1 << (roffset - 1);
      }
    }

  /* Found: hand back the detailed type and map it to its general category. */

  switch ((*type_ptr = ((node->f0 & f0_typemask) >> f0_typeshift)))
    {
    case ucp_Cc:
    case ucp_Cf:
    case ucp_Cn:
    case ucp_Co:
    case ucp_Cs:
    return ucp_C;

    case ucp_Ll:
    case ucp_Lu:
    case_offset = node->f2 & f2_casemask;

    /* The case offset is a 12-bit two's-complement field, so its sign bit is
    0x0800. Subtracting 0x1000 sign-extends it without depending on the width
    of int or on an unsigned-to-signed conversion. */

    if ((case_offset & 0x0800) != 0) case_offset -= 0x1000;
    *case_ptr = (case_offset == 0)? 0 : cc + case_offset;
    return ucp_L;

    case ucp_Lm:
    case ucp_Lo:
    case ucp_Lt:
    *case_ptr = 0;
    return ucp_L;

    case ucp_Mc:
    case ucp_Me:
    case ucp_Mn:
    return ucp_M;

    case ucp_Nd:
    case ucp_Nl:
    case ucp_No:
    return ucp_N;

    case ucp_Pc:
    case ucp_Pd:
    case ucp_Pe:
    case ucp_Pf:
    case ucp_Pi:
    case ucp_Ps:
    case ucp_Po:
    return ucp_P;

    case ucp_Sc:
    case ucp_Sk:
    case ucp_Sm:
    case ucp_So:
    return ucp_S;

    case ucp_Zl:
    case ucp_Zp:
    case ucp_Zs:
    return ucp_Z;

    default:         /* "Should never happen" */
    return -1;
    }
}
/* End of ucp.c */
/* End of ucp.c */
+58
View File
@@ -0,0 +1,58 @@
/*************************************************
* libucp - Unicode Property Table handler *
*************************************************/
/* General character categories: these are the values that ucp_findchar yields
as its function result. The numbering is spelled out explicitly and is the same
as the implicit numbering of a plain enumeration. */

enum {
  ucp_C = 0,     /* Other */
  ucp_L = 1,     /* Letter */
  ucp_M = 2,     /* Mark */
  ucp_N = 3,     /* Number */
  ucp_P = 4,     /* Punctuation */
  ucp_S = 5,     /* Symbol */
  ucp_Z = 6      /* Separator */
};
/* Detailed character types: ucp_findchar stores one of these through its
type_ptr argument. The same numbers are held in the type bits of the property
table nodes, so the numbering is spelled out explicitly here; it is identical
to the implicit numbering of a plain enumeration. */

enum {
  ucp_Cc = 0,    /* Control */
  ucp_Cf = 1,    /* Format */
  ucp_Cn = 2,    /* Unassigned */
  ucp_Co = 3,    /* Private use */
  ucp_Cs = 4,    /* Surrogate */

  ucp_Ll = 5,    /* Lower case letter */
  ucp_Lm = 6,    /* Modifier letter */
  ucp_Lo = 7,    /* Other letter */
  ucp_Lt = 8,    /* Title case letter */
  ucp_Lu = 9,    /* Upper case letter */

  ucp_Mc = 10,   /* Spacing mark */
  ucp_Me = 11,   /* Enclosing mark */
  ucp_Mn = 12,   /* Non-spacing mark */

  ucp_Nd = 13,   /* Decimal number */
  ucp_Nl = 14,   /* Letter number */
  ucp_No = 15,   /* Other number */

  ucp_Pc = 16,   /* Connector punctuation */
  ucp_Pd = 17,   /* Dash punctuation */
  ucp_Pe = 18,   /* Close punctuation */
  ucp_Pf = 19,   /* Final punctuation */
  ucp_Pi = 20,   /* Initial punctuation */
  ucp_Po = 21,   /* Other punctuation */
  ucp_Ps = 22,   /* Open punctuation */

  ucp_Sc = 23,   /* Currency symbol */
  ucp_Sk = 24,   /* Modifier symbol */
  ucp_Sm = 25,   /* Mathematical symbol */
  ucp_So = 26,   /* Other symbol */

  ucp_Zl = 27,   /* Line separator */
  ucp_Zp = 28,   /* Paragraph separator */
  ucp_Zs = 29    /* Space separator */
};
/* For use in PCRE we make this function static so that there is no conflict if
PCRE is linked with an application that makes use of an external version -
assuming an external version is ever released...

The arguments are, in order: the character value to look up, a pointer through
which the detailed character type (ucp_Lu, ucp_Nd, etc.) is returned, and a
pointer through which the opposite-case character is returned for letters. The
result is the general category (ucp_C, ucp_L, etc.) or -1 if the character is
not found. */
static int ucp_findchar(const int, int *, int *);
/* End of ucp.h */
+91
View File
@@ -0,0 +1,91 @@
/*************************************************
* libucp - Unicode Property Table handler *
*************************************************/
/* Internal header file defining the layout of compact nodes in the tree. */
/* One compact node of the property tree: three unsigned short fields, whose
bit layout is picked apart with the f0_xxx and f2_xxx masks. */

typedef struct cnode {
  unsigned short f0;   /* left-child flag, character type, top byte of char */
  unsigned short f1;   /* two least significant bytes of the character */
  unsigned short f2;   /* right-child offset exponent, case offset */
} cnode;
/* Things for the f0 field: a left-child flag, the detailed character type, and
the most significant byte of the character. The constants are plain (signed)
int literals; ucp_findchar combines them with the node fields in signed int
arithmetic. */
#define f0_leftexists 0x8000 /* Left child exists (it is the next node) */
#define f0_typemask 0x3f00 /* Type bits: a detailed type such as ucp_Lu */
#define f0_typeshift 8 /* Right shift that brings the type bits to bit 0 */
#define f0_chhmask 0x00ff /* Character high bits (most significant byte) */
/* Things for the f2 field: the right-child offset exponent in the top four
bits and the signed 12-bit case offset in the rest. */
#define f2_rightmask 0xf000 /* Mask for right offset bits */
#define f2_rightshift 12 /* Right shift that brings the right offset to bit 0 */
#define f2_casemask 0x0fff /* Mask for case offset (12-bit signed value) */
/* The tree consists of a vector of structures of type cnode, with the root
node as the first element. The three short ints (16-bits) are used as follows:
(f0) (1) The 0x8000 bit of f0 is set if a left child exists. The child's node
is the next node in the vector.
(2) The 0x4000 bit of f0 is spare.
(3) The 0x3f00 bits of f0 contain the character type; this is a number
defined by the enumeration in ucp.h (e.g. ucp_Lu).
(4) The bottom 8 bits of f0 contain the most significant byte of the
character's 24-bit codepoint.
(f1) (1) The f1 field contains the two least significant bytes of the
codepoint.
(f2) (1) The 0xf000 bits of f2 contain zero if there is no right child of this
node. Otherwise, they contain one plus the exponent of the power of
two of the offset to the right node (e.g. a value of 3 means 8). The
units of the offset are node items.
(2) The 0x0fff bits of f2 contain the signed offset from this character to
its alternate cased value. They are zero if there is no such
character.
-----------------------------------------------------------------------------
||.|.| type (6) | ms char (8) ||  ls char (16)  ||....|  case offset (12)  ||
-----------------------------------------------------------------------------
  | |                                              |
  | |-> spare                                      |
  |                                        exponent of right
  |-> left child exists                      child offset
The upper/lower casing information is set only for characters that come in
pairs. There are (at present) four non-one-to-one mappings in the Unicode data.
These are ignored. They are:
1FBE Greek Prosgegrammeni (lower, with upper -> capital iota)
2126 Ohm
212A Kelvin
212B Angstrom
Certainly for the last three, having an alternate case would seem to be a
mistake. I don't know any Greek, so cannot comment on the first one.
When searching the tree, proceed as follows:
(1) Start at the first node.
(2) Extract the character value from f1 and the bottom 8 bits of f0.
(3) Compare with the character being sought. If equal, we are done.
(4) If the test character is smaller, inspect the f0_leftexists flag. If it is
not set, the character is not in the tree. If it is set, move to the next
node, and go to (2).
(5) If the test character is bigger, extract the f2_rightmask bits from f2, and
shift them right by f2_rightshift. If the result is zero, the character is
not in the tree. Otherwise, calculate the number of nodes to skip by
shifting the value 1 left by this number minus one, move forward by that
many nodes, and go to (2).
*/
/* End of ucpinternal.h */
File diff suppressed because it is too large Load Diff
+93
View File
@@ -0,0 +1,93 @@
/*************************************************
* Perl-Compatible Regular Expressions *
*************************************************/
/*
This is a library of functions to support regular expressions whose syntax
and semantics are as close as possible to those of the Perl 5 language. See
the file Tech.Notes for some information on the internals.
Written by: Philip Hazel <ph10@cam.ac.uk>
Copyright (c) 1997-2004 University of Cambridge
-----------------------------------------------------------------------------
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are met:
* Redistributions of source code must retain the above copyright notice,
this list of conditions and the following disclaimer.
* Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in the
documentation and/or other materials provided with the distribution.
* Neither the name of the University of Cambridge nor the names of its
contributors may be used to endorse or promote products derived from
this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
POSSIBILITY OF SUCH DAMAGE.
-----------------------------------------------------------------------------
*/
/* This module contains a table for translating Unicode property names into
code values for the ucp_findchar function. It is in a separate module so that
it can be included both in the main pcre library, and into pcretest (for
printing out internals). */
typedef struct {
const char *name;
int value;
} ucp_type_table;
/* The table of property names. A one-letter name stands for a general category
and is given the value 128 plus the category (ucp_C, ucp_L, etc.); a two-letter
name stands for a detailed type and is given the ucp_xx value unchanged. Both
enumerations start at zero, so the added 128 presumably exists to keep the two
kinds of value distinguishable - confirm against the code that uses the table.
The entries are listed in ascending order of name. NOTE(review): the users of
this table are not visible here; if any of them searches it by name with a
binary chop, this ordering must be preserved when entries are added. */
static ucp_type_table utt[] = {
{ "C", 128 + ucp_C },
{ "Cc", ucp_Cc },
{ "Cf", ucp_Cf },
{ "Cn", ucp_Cn },
{ "Co", ucp_Co },
{ "Cs", ucp_Cs },
{ "L", 128 + ucp_L },
{ "Ll", ucp_Ll },
{ "Lm", ucp_Lm },
{ "Lo", ucp_Lo },
{ "Lt", ucp_Lt },
{ "Lu", ucp_Lu },
{ "M", 128 + ucp_M },
{ "Mc", ucp_Mc },
{ "Me", ucp_Me },
{ "Mn", ucp_Mn },
{ "N", 128 + ucp_N },
{ "Nd", ucp_Nd },
{ "Nl", ucp_Nl },
{ "No", ucp_No },
{ "P", 128 + ucp_P },
{ "Pc", ucp_Pc },
{ "Pd", ucp_Pd },
{ "Pe", ucp_Pe },
{ "Pf", ucp_Pf },
{ "Pi", ucp_Pi },
{ "Po", ucp_Po },
{ "Ps", ucp_Ps },
{ "S", 128 + ucp_S },
{ "Sc", ucp_Sc },
{ "Sk", ucp_Sk },
{ "Sm", ucp_Sm },
{ "So", ucp_So },
{ "Z", 128 + ucp_Z },
{ "Zl", ucp_Zl },
{ "Zp", ucp_Zp },
{ "Zs", ucp_Zs }
};
/* End of ucptypetable.c */