mirror of
https://github.com/php/php-src.git
synced 2026-04-25 08:58:28 +02:00
Upgrade PCRE library to 5.0.
This commit is contained in:
@@ -1,6 +1,7 @@
|
||||
PHP NEWS
|
||||
|||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||
|
||||
?? ??? 2005, PHP 5.0.5
|
||||
- Upgraded PCRE library to version 5.0. (Andrei)
|
||||
- Removed php_check_syntax() function which never worked properly. (Ilia)
|
||||
- Added new function mysqli_set_charset(). (Georg)
|
||||
- Added man pages for "phpize" and "php-config" scripts. (Jakub Vrana)
|
||||
|
||||
+2
-2
@@ -13,7 +13,7 @@ PHP_ARG_WITH(pcre-regex,for PCRE support,
|
||||
|
||||
if test "$PHP_PCRE_REGEX" != "no"; then
|
||||
if test "$PHP_PCRE_REGEX" = "yes"; then
|
||||
PHP_NEW_EXTENSION(pcre, pcrelib/maketables.c pcrelib/get.c pcrelib/study.c pcrelib/pcre.c php_pcre.c, $ext_shared,,-DSUPPORT_UTF8 -DLINK_SIZE=2 -DPOSIX_MALLOC_THRESHOLD=10 -I@ext_srcdir@/pcrelib)
|
||||
PHP_NEW_EXTENSION(pcre, pcrelib/maketables.c pcrelib/get.c pcrelib/study.c pcrelib/pcre.c php_pcre.c, $ext_shared,,-DSUPPORT_UTF8 -DSUPPORT_UCP -DLINK_SIZE=2 -DPOSIX_MALLOC_THRESHOLD=10 -DMATCH_LIMIT=10000000 -I@ext_srcdir@/pcrelib)
|
||||
PHP_ADD_BUILD_DIR($ext_builddir/pcrelib)
|
||||
AC_DEFINE(HAVE_BUNDLED_PCRE, 1, [ ])
|
||||
else
|
||||
@@ -50,7 +50,7 @@ if test "$PHP_PCRE_REGEX" != "no"; then
|
||||
|
||||
AC_DEFINE(HAVE_PCRE, 1, [ ])
|
||||
PHP_ADD_INCLUDE($PCRE_INCDIR)
|
||||
PHP_NEW_EXTENSION(pcre, php_pcre.c, $ext_shared,,-DSUPPORT_UTF8 -DLINK_SIZE=2 -DPOSIX_MALLOC_THRESHOLD=10)
|
||||
PHP_NEW_EXTENSION(pcre, php_pcre.c, $ext_shared,,-DSUPPORT_UTF8 -DSUPPORT_UCP -DLINK_SIZE=2 -DPOSIX_MALLOC_THRESHOLD=10 -DMATCH_LIMIT=10000000)
|
||||
fi
|
||||
PHP_SUBST(PCRE_SHARED_LIBADD)
|
||||
fi
|
||||
|
||||
@@ -3,4 +3,4 @@ Written by: Philip Hazel <ph10@cam.ac.uk>
|
||||
University of Cambridge Computing Service,
|
||||
Cambridge, England. Phone: +44 1223 334714.
|
||||
|
||||
Copyright (c) 1997-2003 University of Cambridge
|
||||
Copyright (c) 1997-2004 University of Cambridge
|
||||
|
||||
+27
-36
@@ -4,51 +4,42 @@ PCRE LICENCE
|
||||
PCRE is a library of functions to support regular expressions whose syntax
|
||||
and semantics are as close as possible to those of the Perl 5 language.
|
||||
|
||||
Release 5 of PCRE is distributed under the terms of the "BSD" licence, as
|
||||
specified below. The documentation for PCRE, supplied in the "doc"
|
||||
directory, is distributed under the same terms as the software itself.
|
||||
|
||||
Written by: Philip Hazel <ph10@cam.ac.uk>
|
||||
|
||||
University of Cambridge Computing Service,
|
||||
Cambridge, England. Phone: +44 1223 334714.
|
||||
|
||||
Copyright (c) 1997-2003 University of Cambridge
|
||||
Copyright (c) 1997-2004 University of Cambridge
|
||||
All rights reserved.
|
||||
|
||||
Permission is granted to anyone to use this software for any purpose on any
|
||||
computer system, and to redistribute it freely, subject to the following
|
||||
restrictions:
|
||||
Redistribution and use in source and binary forms, with or without
|
||||
modification, are permitted provided that the following conditions are met:
|
||||
|
||||
1. This software is distributed in the hope that it will be useful,
|
||||
but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
|
||||
* Redistributions of source code must retain the above copyright notice,
|
||||
this list of conditions and the following disclaimer.
|
||||
|
||||
2. The origin of this software must not be misrepresented, either by
|
||||
explicit claim or by omission. In practice, this means that if you use
|
||||
PCRE in software that you distribute to others, commercially or
|
||||
otherwise, you must put a sentence like this
|
||||
* Redistributions in binary form must reproduce the above copyright
|
||||
notice, this list of conditions and the following disclaimer in the
|
||||
documentation and/or other materials provided with the distribution.
|
||||
|
||||
Regular expression support is provided by the PCRE library package,
|
||||
which is open source software, written by Philip Hazel, and copyright
|
||||
by the University of Cambridge, England.
|
||||
* Neither the name of the University of Cambridge nor the names of its
|
||||
contributors may be used to endorse or promote products derived from
|
||||
this software without specific prior written permission.
|
||||
|
||||
somewhere reasonably visible in your documentation and in any relevant
|
||||
files or online help data or similar. A reference to the ftp site for
|
||||
the source, that is, to
|
||||
|
||||
ftp://ftp.csx.cam.ac.uk/pub/software/programming/pcre/
|
||||
|
||||
should also be given in the documentation. However, this condition is not
|
||||
intended to apply to whole chains of software. If package A includes PCRE,
|
||||
it must acknowledge it, but if package B is software that includes package
|
||||
A, the condition is not imposed on package B (unless it uses PCRE
|
||||
independently).
|
||||
|
||||
3. Altered versions must be plainly marked as such, and must not be
|
||||
misrepresented as being the original software.
|
||||
|
||||
4. If PCRE is embedded in any software that is released under the GNU
|
||||
General Purpose Licence (GPL), or Lesser General Purpose Licence (LGPL),
|
||||
then the terms of that licence shall supersede any condition above with
|
||||
which it is incompatible.
|
||||
|
||||
The documentation for PCRE, supplied in the "doc" directory, is distributed
|
||||
under the same terms as the software itself.
|
||||
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
|
||||
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
|
||||
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
|
||||
ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
|
||||
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
|
||||
CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
|
||||
SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
|
||||
INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
|
||||
CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
|
||||
ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
|
||||
POSSIBILITY OF SUCH DAMAGE.
|
||||
|
||||
End
|
||||
|
||||
@@ -1,6 +1,142 @@
|
||||
ChangeLog for PCRE
|
||||
------------------
|
||||
|
||||
Version 5.0 13-Sep-04
|
||||
---------------------
|
||||
|
||||
1. Internal change: literal characters are no longer packed up into items
|
||||
containing multiple characters in a single byte-string. Each character
|
||||
is now matched using a separate opcode. However, there may be more than one
|
||||
byte in the character in UTF-8 mode.
|
||||
|
||||
2. The pcre_callout_block structure has two new fields: pattern_position and
|
||||
next_item_length. These contain the offset in the pattern to the next match
|
||||
item, and its length, respectively.
|
||||
|
||||
3. The PCRE_AUTO_CALLOUT option for pcre_compile() requests the automatic
|
||||
insertion of callouts before each pattern item. Added the /C option to
|
||||
pcretest to make use of this.
|
||||
|
||||
4. On the advice of a Windows user, the lines
|
||||
|
||||
#if defined(_WIN32) || defined(WIN32)
|
||||
_setmode( _fileno( stdout ), 0x8000 );
|
||||
#endif /* defined(_WIN32) || defined(WIN32) */
|
||||
|
||||
have been added to the source of pcretest. This apparently does useful
|
||||
magic in relation to line terminators.
|
||||
|
||||
5. Changed "r" and "w" in the calls to fopen() in pcretest to "rb" and "wb"
|
||||
for the benefit of those environments where the "b" makes a difference.
|
||||
|
||||
6. The icc compiler has the same options as gcc, but "configure" doesn't seem
|
||||
to know about it. I have put a hack into configure.in that adds in code
|
||||
to set GCC=yes if CC=icc. This seems to end up at a point in the
|
||||
generated configure script that is early enough to affect the setting of
|
||||
compiler options, which is what is needed, but I have no means of testing
|
||||
whether it really works. (The user who reported this had patched the
|
||||
generated configure script, which of course I cannot do.)
|
||||
|
||||
LATER: After change 23 below (new libtool files), the configure script
|
||||
seems to know about icc (and also ecc). Therefore, I have commented out
|
||||
this hack in configure.in.
|
||||
|
||||
7. Added support for pkg-config (2 patches were sent in).
|
||||
|
||||
8. Negated POSIX character classes that used a combination of internal tables
|
||||
were completely broken. These were [[:^alpha:]], [[:^alnum:]], and
|
||||
[[:^ascii:]]. Typically, they would match almost any characters. The other
|
||||
POSIX classes were not broken in this way.
|
||||
|
||||
9. Matching the pattern "\b.*?" against "ab cd", starting at offset 1, failed
|
||||
to find the match, as PCRE was deluded into thinking that the match had to
|
||||
start at the start point or following a newline. The same bug applied to
|
||||
patterns with negative forward assertions or any backward assertions
|
||||
preceding ".*" at the start, unless the pattern required a fixed first
|
||||
character. This was a failing pattern: "(?!.bcd).*". The bug is now fixed.
|
||||
|
||||
10. In UTF-8 mode, when moving forwards in the subject after a failed match
|
||||
starting at the last subject character, bytes beyond the end of the subject
|
||||
string were read.
|
||||
|
||||
11. Renamed the variable "class" as "classbits" to make life easier for C++
|
||||
users. (Previously there was a macro definition, but it apparently wasn't
|
||||
enough.)
|
||||
|
||||
12. Added the new field "tables" to the extra data so that tables can be passed
|
||||
in at exec time, or the internal tables can be re-selected. This allows
|
||||
a compiled regex to be saved and re-used at a later time by a different
|
||||
program that might have everything at different addresses.
|
||||
|
||||
13. Modified the pcre-config script so that, when run on Solaris, it shows a
|
||||
-R library as well as a -L library.
|
||||
|
||||
14. The debugging options of pcretest (-d on the command line or D on a
|
||||
pattern) showed incorrect output for anything following an extended class
|
||||
that contained multibyte characters and which was followed by a quantifier.
|
||||
|
||||
15. Added optional support for general category Unicode character properties
|
||||
via the \p, \P, and \X escapes. Unicode property support implies UTF-8
|
||||
support. It adds about 90K to the size of the library. The meanings of the
|
||||
inbuilt class escapes such as \d and \s have NOT been changed.
|
||||
|
||||
16. Updated pcredemo.c to include calls to free() to release the memory for the
|
||||
compiled pattern.
|
||||
|
||||
17. The generated file chartables.c was being created in the source directory
|
||||
instead of in the building directory. This caused the build to fail if the
|
||||
source directory was different from the building directory, and was
|
||||
read-only.
|
||||
|
||||
18. Added some sample Win commands from Mark Tetrode into the NON-UNIX-USE
|
||||
file. No doubt somebody will tell me if they don't make sense... Also added
|
||||
Dan Mooney's comments about building on OpenVMS.
|
||||
|
||||
19. Added support for partial matching via the PCRE_PARTIAL option for
|
||||
pcre_exec() and the \P data escape in pcretest.
|
||||
|
||||
20. Extended pcretest with 3 new pattern features:
|
||||
|
||||
(i) A pattern option of the form ">rest-of-line" causes pcretest to
|
||||
write the compiled pattern to the file whose name is "rest-of-line".
|
||||
This is a straight binary dump of the data, with the saved pointer to
|
||||
the character tables forced to be NULL. The study data, if any, is
|
||||
written too. After writing, pcretest reads a new pattern.
|
||||
|
||||
(ii) If, instead of a pattern, "<rest-of-line" is given, pcretest reads a
|
||||
compiled pattern from the given file. There must not be any
|
||||
occurrences of "<" in the file name (pretty unlikely); if there are,
|
||||
pcretest will instead treat the initial "<" as a pattern delimiter.
|
||||
After reading in the pattern, pcretest goes on to read data lines as
|
||||
usual.
|
||||
|
||||
(iii) The F pattern option causes pcretest to flip the bytes in the 32-bit
|
||||
and 16-bit fields in a compiled pattern, to simulate a pattern that
|
||||
was compiled on a host of opposite endianness.
|
||||
|
||||
21. The pcre_exec() function can now cope with patterns that were compiled on
|
||||
hosts of opposite endianness, with this restriction:
|
||||
|
||||
As for any compiled expression that is saved and used later, the tables
|
||||
pointer field cannot be preserved; the extra_data field in the arguments
|
||||
to pcre_exec() should be used to pass in a tables address if a value
|
||||
other than the default internal tables were used at compile time.
|
||||
|
||||
22. Calling pcre_exec() with a negative value of the "ovecsize" parameter is
|
||||
now diagnosed as an error. Previously, most of the time, a negative number
|
||||
would have been treated as zero, but if in addition "ovector" was passed as
|
||||
NULL, a crash could occur.
|
||||
|
||||
23. Updated the files ltmain.sh, config.sub, config.guess, and aclocal.m4 with
|
||||
new versions from the libtool 1.5 distribution (the last one is a copy of
|
||||
a file called libtool.m4). This seems to have fixed the need to patch
|
||||
"configure" to support Darwin 1.3 (which I used to do). However, I still
|
||||
had to patch ltmain.sh to ensure that ${SED} is set (it isn't on my
|
||||
workstation).
|
||||
|
||||
24. Changed the PCRE licence to be the more standard "BSD" licence.
|
||||
|
||||
|
||||
Version 4.5 01-Dec-03
|
||||
---------------------
|
||||
|
||||
|
||||
+27
-36
@@ -4,51 +4,42 @@ PCRE LICENCE
|
||||
PCRE is a library of functions to support regular expressions whose syntax
|
||||
and semantics are as close as possible to those of the Perl 5 language.
|
||||
|
||||
Release 5 of PCRE is distributed under the terms of the "BSD" licence, as
|
||||
specified below. The documentation for PCRE, supplied in the "doc"
|
||||
directory, is distributed under the same terms as the software itself.
|
||||
|
||||
Written by: Philip Hazel <ph10@cam.ac.uk>
|
||||
|
||||
University of Cambridge Computing Service,
|
||||
Cambridge, England. Phone: +44 1223 334714.
|
||||
|
||||
Copyright (c) 1997-2003 University of Cambridge
|
||||
Copyright (c) 1997-2004 University of Cambridge
|
||||
All rights reserved.
|
||||
|
||||
Permission is granted to anyone to use this software for any purpose on any
|
||||
computer system, and to redistribute it freely, subject to the following
|
||||
restrictions:
|
||||
Redistribution and use in source and binary forms, with or without
|
||||
modification, are permitted provided that the following conditions are met:
|
||||
|
||||
1. This software is distributed in the hope that it will be useful,
|
||||
but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
|
||||
* Redistributions of source code must retain the above copyright notice,
|
||||
this list of conditions and the following disclaimer.
|
||||
|
||||
2. The origin of this software must not be misrepresented, either by
|
||||
explicit claim or by omission. In practice, this means that if you use
|
||||
PCRE in software that you distribute to others, commercially or
|
||||
otherwise, you must put a sentence like this
|
||||
* Redistributions in binary form must reproduce the above copyright
|
||||
notice, this list of conditions and the following disclaimer in the
|
||||
documentation and/or other materials provided with the distribution.
|
||||
|
||||
Regular expression support is provided by the PCRE library package,
|
||||
which is open source software, written by Philip Hazel, and copyright
|
||||
by the University of Cambridge, England.
|
||||
* Neither the name of the University of Cambridge nor the names of its
|
||||
contributors may be used to endorse or promote products derived from
|
||||
this software without specific prior written permission.
|
||||
|
||||
somewhere reasonably visible in your documentation and in any relevant
|
||||
files or online help data or similar. A reference to the ftp site for
|
||||
the source, that is, to
|
||||
|
||||
ftp://ftp.csx.cam.ac.uk/pub/software/programming/pcre/
|
||||
|
||||
should also be given in the documentation. However, this condition is not
|
||||
intended to apply to whole chains of software. If package A includes PCRE,
|
||||
it must acknowledge it, but if package B is software that includes package
|
||||
A, the condition is not imposed on package B (unless it uses PCRE
|
||||
independently).
|
||||
|
||||
3. Altered versions must be plainly marked as such, and must not be
|
||||
misrepresented as being the original software.
|
||||
|
||||
4. If PCRE is embedded in any software that is released under the GNU
|
||||
General Purpose Licence (GPL), or Lesser General Purpose Licence (LGPL),
|
||||
then the terms of that licence shall supersede any condition above with
|
||||
which it is incompatible.
|
||||
|
||||
The documentation for PCRE, supplied in the "doc" directory, is distributed
|
||||
under the same terms as the software itself.
|
||||
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
|
||||
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
|
||||
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
|
||||
ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
|
||||
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
|
||||
CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
|
||||
SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
|
||||
INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
|
||||
CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
|
||||
ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
|
||||
POSSIBILITY OF SUCH DAMAGE.
|
||||
|
||||
End
|
||||
|
||||
@@ -1,6 +1,53 @@
|
||||
News about PCRE releases
|
||||
------------------------
|
||||
|
||||
Release 5.0 13-Sep-04
|
||||
---------------------
|
||||
|
||||
The licence under which PCRE is released has been changed to the more
|
||||
conventional "BSD" licence.
|
||||
|
||||
In the code, some bugs have been fixed, and there are also some major changes
|
||||
in this release (which is why I've increased the number to 5.0). Some changes
|
||||
are internal rearrangements, and some provide a number of new facilities. The
|
||||
new features are:
|
||||
|
||||
1. There's an "automatic callout" feature that inserts callouts before every
|
||||
item in the regex, and there's a new callout field that gives the position
|
||||
in the pattern - useful for debugging and tracing.
|
||||
|
||||
2. The extra_data structure can now be used to pass in a set of character
|
||||
tables at exec time. This is useful if compiled regex are saved and re-used
|
||||
at a later time when the tables may not be at the same address. If the
|
||||
default internal tables are used, the pointer saved with the compiled
|
||||
pattern is now set to NULL, which means that you don't need to do anything
|
||||
special unless you are using custom tables.
|
||||
|
||||
3. It is possible, with some restrictions on the content of the regex, to
|
||||
request "partial" matching. A special return code is given if all of the
|
||||
subject string matched part of the regex. This could be useful for testing
|
||||
an input field as it is being typed.
|
||||
|
||||
4. There is now some optional support for Unicode character properties, which
|
||||
means that pattern items such as \p{Lu} and \X can now be used. Only
|
||||
the general category properties are supported. If PCRE is compiled with this
|
||||
support, an additional 90K data structure is included, which increases the
|
||||
size of the library dramatically.
|
||||
|
||||
5. There is support for saving compiled patterns and re-using them later.
|
||||
|
||||
6. There is support for running regular expressions that were compiled on a
|
||||
different host with the opposite endianness.
|
||||
|
||||
7. The pcretest program has been extended to accommodate the new features.
|
||||
|
||||
The main internal rearrangement is that sequences of literal characters are no
|
||||
longer handled as strings. Instead, each character is handled on its own. This
|
||||
makes some UTF-8 handling easier, and makes the support of partial matching
|
||||
possible. Compiled patterns containing long literal strings will be larger as a
|
||||
result of this change; I hope that performance will not be much affected.
|
||||
|
||||
|
||||
Release 4.5 01-Dec-03
|
||||
---------------------
|
||||
|
||||
|
||||
+132
-10
@@ -1,19 +1,25 @@
|
||||
Compiling PCRE on non-Unix systems
|
||||
----------------------------------
|
||||
|
||||
See below for comments on Cygwin or MinGW usage. I (Philip Hazel) have no
|
||||
knowledge of Windows sytems and how their libraries work. The items in the
|
||||
PCRE Makefile that relate to anything other than Unix-like systems have been
|
||||
contributed by PCRE users. There are some other comments and files in the
|
||||
Contrib directory on the ftp site that you may find useful.
|
||||
See below for comments on Cygwin or MinGW and OpenVMS usage. I (Philip Hazel)
|
||||
have no knowledge of Windows or VMS systems and how their libraries work. The
|
||||
items in the PCRE Makefile that relate to anything other than Unix-like systems
|
||||
have been contributed by PCRE users. There are some other comments and files in
|
||||
the Contrib directory on the ftp site that you may find useful. See
|
||||
|
||||
The following are generic comments about building PCRE:
|
||||
ftp://ftp.csx.cam.ac.uk/pub/software/programming/pcre/Contrib
|
||||
|
||||
If you want to compile PCRE for a non-Unix system (or perhaps, more strictly,
|
||||
for a system that does not support "configure" and make files), note that PCRE
|
||||
consists entirely of code written in Standard C, and so should compile
|
||||
successfully on any machine with a Standard C compiler and library, using
|
||||
normal compiling commands to do the following:
|
||||
for a system that does not support "configure" and "make" files), note that
|
||||
PCRE consists entirely of code written in Standard C, and so should compile
|
||||
successfully on any system that has a Standard C compiler and library.
|
||||
|
||||
|
||||
GENERIC INSTRUCTIONS
|
||||
|
||||
The following are generic comments about building PCRE. The interspersed
|
||||
indented commands are suggestions from Mark Tetrode as to which commands you
|
||||
might use on a Windows system to build a static library.
|
||||
|
||||
(1) Copy or rename the file config.in as config.h, and change the macros that
|
||||
define HAVE_STRERROR and HAVE_MEMMOVE to define them as 1 rather than 0.
|
||||
@@ -23,32 +29,85 @@ particular, if you want to force a specific value for newline, you can define
|
||||
the NEWLINE macro. The default is to use '\n', thereby using whatever value
|
||||
your compiler gives to '\n'.
|
||||
|
||||
rem Mark Tetrode's commands
|
||||
copy config.in config.h
|
||||
rem Use write, because notepad cannot handle UNIX files. Change values.
|
||||
write config.h
|
||||
|
||||
(2) Copy or rename the file pcre.in as pcre.h, and change the macro definitions
|
||||
for PCRE_MAJOR, PCRE_MINOR, and PCRE_DATE near its start to the values set in
|
||||
configure.in.
|
||||
|
||||
rem Mark Tetrode's commands
|
||||
copy pcre.in pcre.h
|
||||
rem Read values from configure.in
|
||||
write configure.in
|
||||
rem Change values
|
||||
write pcre.h
|
||||
|
||||
(3) Compile dftables.c as a stand-alone program, and then run it with
|
||||
the single argument "chartables.c". This generates a set of standard
|
||||
character tables and writes them to that file.
|
||||
|
||||
rem Mark Tetrode's commands
|
||||
rem Compile & run
|
||||
cl -DSUPPORT_UTF8 dftables.c
|
||||
dftables.exe chartables.c
|
||||
|
||||
(4) Compile maketables.c, get.c, study.c and pcre.c and link them all
|
||||
together into an object library in whichever form your system keeps such
|
||||
libraries. This is the pcre library (chartables.c is included by means of an
|
||||
#include directive). If your system has static and shared libraries, you may
|
||||
have to do this once for each type.
|
||||
|
||||
rem Mark Tetrode's commands, for a static library
|
||||
rem Compile & lib
|
||||
cl -DSUPPORT_UTF8 -DPOSIX_MALLOC_THRESHOLD=10 /c maketables.c get.c study.c pcre.c
|
||||
lib /OUT:pcre.lib maketables.obj get.obj study.obj pcre.obj
|
||||
|
||||
(5) Similarly, compile pcreposix.c and link it (on its own) as the pcreposix
|
||||
library.
|
||||
|
||||
rem Mark Tetrode's commands, for a static library
|
||||
rem Compile & lib
|
||||
cl -DSUPPORT_UTF8 -DPOSIX_MALLOC_THRESHOLD=10 /c pcreposix.c
|
||||
lib /OUT:pcreposix.lib pcreposix.obj
|
||||
|
||||
(6) Compile the test program pcretest.c. This needs the functions in the
|
||||
pcre and pcreposix libraries when linking.
|
||||
|
||||
rem Mark Tetrode's commands
|
||||
rem compile & link
|
||||
cl pcretest.c pcre.lib pcreposix.lib
|
||||
|
||||
(7) Run pcretest on the testinput files in the testdata directory, and check
|
||||
that the output matches the corresponding testoutput files. You must use the
|
||||
-i option when checking testinput2. Note that the supplied files are in Unix
|
||||
format, with just LF characters as line terminators. You may need to edit them
|
||||
to change this if your system uses a different convention.
|
||||
|
||||
rem Mark Tetrode's commands
|
||||
rem Make a change, i.e. space, backspace, and save again - do this for all
|
||||
rem to change UNIX to Win, \n to \r\n
|
||||
write testoutput1
|
||||
write testoutput2
|
||||
write testoutput3
|
||||
write testoutput4
|
||||
write testoutput5
|
||||
pcretest testdata\testinput1 testdata\myoutput1
|
||||
windiff testdata\testoutput1 testdata\myoutput1
|
||||
pcretest -i testdata\testinput2 testdata\myoutput2
|
||||
windiff testdata\testoutput2 testdata\myoutput2
|
||||
pcretest testdata\testinput3 testdata\myoutput3
|
||||
windiff testdata\testoutput3 testdata\myoutput3
|
||||
pcretest testdata\testinput4 testdata\myoutput4
|
||||
windiff testdata\testoutput4 testdata\myoutput4
|
||||
pcretest testdata\testinput5 testdata\myoutput5
|
||||
windiff testdata\testoutput5 testdata\myoutput5
|
||||
|
||||
|
||||
FURTHER REMARKS
|
||||
|
||||
If you have a system without "configure" but where you can use a Makefile, edit
|
||||
Makefile.in to create Makefile, substituting suitable values for the variables
|
||||
at the head of the file.
|
||||
@@ -119,4 +178,67 @@ void (*pcre_free)(void *) = free;
|
||||
#endif
|
||||
=========================
|
||||
|
||||
|
||||
BUILDING PCRE ON OPENVMS
|
||||
|
||||
Dan Mooney sent the following comments about building PCRE on OpenVMS:
|
||||
|
||||
"It was quite easy to compile and link the library. I don't have a formal
|
||||
make file but the attached file [reproduced below] contains the OpenVMS DCL
|
||||
commands I used to build the library. I had to add #define
|
||||
POSIX_MALLOC_THRESHOLD 10 to pcre.h since it was not defined anywhere.
|
||||
|
||||
The library was built on:
|
||||
O/S: HP OpenVMS v7.3-1
|
||||
Compiler: Compaq C v6.5-001-48BCD
|
||||
Linker: vA13-01
|
||||
|
||||
The test results did not match 100% due to the issues you mention in your
|
||||
documentation regarding isprint(), iscntrl(), isgraph() and ispunct(). I
|
||||
modified some of the character tables temporarily and was able to get the
|
||||
results to match. Tests using the fr locale did not match since I don't have
|
||||
that locale loaded. The study size was always reported to be 3 less than the
|
||||
value in the standard test output files."
|
||||
|
||||
=========================
|
||||
$! This DCL procedure builds PCRE on OpenVMS
|
||||
$!
|
||||
$! I followed the instructions in the non-unix-use file in the distribution.
|
||||
$!
|
||||
$ COMPILE == "CC/LIST/NOMEMBER_ALIGNMENT/PREFIX_LIBRARY_ENTRIES=ALL_ENTRIES
|
||||
$ COMPILE DFTABLES.C
|
||||
$ LINK/EXE=DFTABLES.EXE DFTABLES.OBJ
|
||||
$ RUN DFTABLES.EXE/OUTPUT=CHARTABLES.C
|
||||
$ COMPILE MAKETABLES.C
|
||||
$ COMPILE GET.C
|
||||
$ COMPILE STUDY.C
|
||||
$! I had to set POSIX_MALLOC_THRESHOLD to 10 in PCRE.H since the symbol
|
||||
$! did not seem to be defined anywhere.
|
||||
$! I edited pcre.h and added #DEFINE SUPPORT_UTF8 to enable UTF8 support.
|
||||
$ COMPILE PCRE.C
|
||||
$ LIB/CREATE PCRE MAKETABLES.OBJ, GET.OBJ, STUDY.OBJ, PCRE.OBJ
|
||||
$! I had to set POSIX_MALLOC_THRESHOLD to 10 in PCRE.H since the symbol
|
||||
$! did not seem to be defined anywhere.
|
||||
$ COMPILE PCREPOSIX.C
|
||||
$ LIB/CREATE PCREPOSIX PCREPOSIX.OBJ
|
||||
$ COMPILE PCRETEST.C
|
||||
$ LINK/EXE=PCRETEST.EXE PCRETEST.OBJ, PCRE/LIB, PCREPOSIX/LIB
|
||||
$! C programs that want access to command line arguments must be
|
||||
$! defined as a symbol
|
||||
$ PCRETEST :== "$ SYS$ROADSUSERS:[DMOONEY.REGEXP]PCRETEST.EXE"
|
||||
$! Arguments must be enclosed in quotes.
|
||||
$ PCRETEST "-C"
|
||||
$! Test results:
|
||||
$!
|
||||
$! The test results did not match 100%. The functions isprint(), iscntrl(),
|
||||
$! isgraph() and ispunct() on OpenVMS must not produce the same results
|
||||
$! as the system that built the test output files provided with the
|
||||
$! distribution.
|
||||
$!
|
||||
$! The study size did not match and was always 3 less on OpenVMS.
|
||||
$!
|
||||
$! Locale could not be set to fr
|
||||
$!
|
||||
=========================
|
||||
|
||||
****
|
||||
|
||||
+82
-20
@@ -22,6 +22,28 @@ ensure that they link with PCRE's libpcreposix library. Otherwise they may pick
|
||||
up the "real" POSIX functions of the same name.
|
||||
|
||||
|
||||
Documentation for PCRE
|
||||
----------------------
|
||||
|
||||
If you install PCRE in the normal way, you will end up with an installed set of
|
||||
man pages whose names all start with "pcre". The one that is called "pcre"
|
||||
lists all the others. In addition to these man pages, the PCRE documentation is
|
||||
supplied in two other forms; however, as there is no standard place to install
|
||||
them, they are left in the doc directory of the unpacked source distribution.
|
||||
These forms are:
|
||||
|
||||
1. Files called doc/pcre.txt, doc/pcregrep.txt, and doc/pcretest.txt. The
|
||||
first of these is a concatenation of the text forms of all the section 3
|
||||
man pages except those that summarize individual functions. The other two
|
||||
are the text forms of the section 1 man pages for the pcregrep and
|
||||
pcretest commands. Text forms are provided for ease of scanning with text
|
||||
editors or similar tools.
|
||||
|
||||
2. A subdirectory called doc/html contains all the documentation in HTML
|
||||
form, hyperlinked in various ways, and rooted in a file called
|
||||
doc/index.html.
|
||||
|
||||
|
||||
Contributions by users of PCRE
|
||||
------------------------------
|
||||
|
||||
@@ -46,7 +68,7 @@ INSTALL.
|
||||
|
||||
Most commonly, people build PCRE within its own distribution directory, and in
|
||||
this case, on many systems, just running "./configure" is sufficient, but the
|
||||
usual methods of changing standard defaults are available. For example,
|
||||
usual methods of changing standard defaults are available. For example:
|
||||
|
||||
CFLAGS='-O2 -Wall' ./configure --prefix=/opt/local
|
||||
|
||||
@@ -69,6 +91,13 @@ library. You can read more about them in the pcrebuild man page.
|
||||
for handling UTF-8 is not included in the library. (Even when included, it
|
||||
still has to be enabled by an option at run time.)
|
||||
|
||||
. If, in addition to support for UTF-8 character strings, you want to include
|
||||
support for the \P, \p, and \X sequences that recognize Unicode character
|
||||
properties, you must add --enable-unicode-properties to the "configure"
|
||||
command. This adds about 90K to the size of the library (in the form of a
|
||||
property table); only the basic two-letter properties such as Lu are
|
||||
supported.
|
||||
|
||||
. You can build PCRE to recognize CR or NL as the newline character, instead
|
||||
of whatever your compiler uses for "\n", by adding --newline-is-cr or
|
||||
--newline-is-nl to the "configure" command, respectively. Only do this if you
|
||||
@@ -111,12 +140,14 @@ library. You can read more about them in the pcrebuild man page.
|
||||
on the "configure" command. PCRE runs more slowly in this mode, but it may be
|
||||
necessary in environments with limited stack sizes.
|
||||
|
||||
The "configure" script builds five files:
|
||||
The "configure" script builds seven files:
|
||||
|
||||
. libtool is a script that builds shared and/or static libraries
|
||||
. pcre.h is built by copying pcre.in and making substitutions
|
||||
. Makefile is built by copying Makefile.in and making substitutions.
|
||||
. config.h is built by copying config.in and making substitutions.
|
||||
. pcre-config is built by copying pcre-config.in and making substitutions.
|
||||
. libpcre.pc is data for the pkg-config command, built from libpcre.pc.in
|
||||
. libtool is a script that builds shared and/or static libraries
|
||||
. RunTest is a script for running tests
|
||||
|
||||
Once "configure" has run, you can run "make". It builds two libraries called
|
||||
@@ -125,20 +156,33 @@ command. You can use "make install" to copy these, the public header files
|
||||
pcre.h and pcreposix.h, and the man pages to appropriate live directories on
|
||||
your system, in the normal way.
|
||||
|
||||
|
||||
Retrieving configuration information on Unix-like systems
|
||||
---------------------------------------------------------
|
||||
|
||||
Running "make install" also installs the command pcre-config, which can be used
|
||||
to recall information about the PCRE configuration and installation. For
|
||||
example,
|
||||
example:
|
||||
|
||||
pcre-config --version
|
||||
|
||||
prints the version number, and
|
||||
|
||||
pcre-config --libs
|
||||
pcre-config --libs
|
||||
|
||||
outputs information about where the library is installed. This command can be
|
||||
included in makefiles for programs that use PCRE, saving the programmer from
|
||||
having to remember too many details.
|
||||
|
||||
The pkg-config command is another system for saving and retrieving information
|
||||
about installed libraries. Instead of separate commands for each library, a
|
||||
single command is used. For example:
|
||||
|
||||
pkg-config --cflags pcre
|
||||
|
||||
The data is held in *.pc files that are installed in a directory called
|
||||
pkgconfig.
|
||||
|
||||
|
||||
Shared libraries on Unix-like systems
|
||||
-------------------------------------
|
||||
@@ -158,7 +202,7 @@ installed themselves. However, the versions left in the source directory still
|
||||
use the uninstalled libraries.
|
||||
|
||||
To build PCRE using static libraries only you must use --disable-shared when
|
||||
configuring it. For example
|
||||
configuring it. For example:
|
||||
|
||||
./configure --prefix=/usr/gnu --disable-shared
|
||||
|
||||
@@ -202,9 +246,9 @@ configuring process. (This can also be run by "make runtest", "make check", or
|
||||
The script runs the pcretest test program (which is documented in its own man
|
||||
page) on each of the testinput files (in the testdata directory) in turn,
|
||||
and compares the output with the contents of the corresponding testoutput file.
|
||||
A file called testtry is used to hold the output from pcretest. To run pcretest
|
||||
on just one of the test files, give its number as an argument to RunTest, for
|
||||
example:
|
||||
A file called testtry is used to hold the main output from pcretest
|
||||
(testsavedregex is also used as a working file). To run pcretest on just one of
|
||||
the test files, give its number as an argument to RunTest, for example:
|
||||
|
||||
RunTest 2
|
||||
|
||||
@@ -247,19 +291,23 @@ running "configure". This file can be also fed directly to the perltest script,
|
||||
provided you are running Perl 5.8 or higher. (For Perl 5.6, a small patch,
|
||||
commented in the script, can be used.)
|
||||
|
||||
The fifth and final file tests error handling with UTF-8 encoding, and internal
|
||||
UTF-8 features of PCRE that are not relevant to Perl.
|
||||
The fifth test checks error handling with UTF-8 encoding, and internal UTF-8
|
||||
features of PCRE that are not relevant to Perl.
|
||||
|
||||
The sixth and final test checks the support for Unicode character properties.
|
||||
It is not run automatically unless PCRE is built with Unicode property support.
|
||||
To do this you must set --enable-unicode-properties when running "configure".
|
||||
|
||||
|
||||
Character tables
|
||||
----------------
|
||||
|
||||
PCRE uses four tables for manipulating and identifying characters. The final
|
||||
argument of the pcre_compile() function is a pointer to a block of memory
|
||||
containing the concatenated tables. A call to pcre_maketables() can be used to
|
||||
generate a set of tables in the current locale. If the final argument for
|
||||
pcre_compile() is passed as NULL, a set of default tables that is built into
|
||||
the binary is used.
|
||||
PCRE uses four tables for manipulating and identifying characters whose values
|
||||
are less than 256. The final argument of the pcre_compile() function is a
|
||||
pointer to a block of memory containing the concatenated tables. A call to
|
||||
pcre_maketables() can be used to generate a set of tables in the current
|
||||
locale. If the final argument for pcre_compile() is passed as NULL, a set of
|
||||
default tables that is built into the binary is used.
|
||||
|
||||
The source file called chartables.c contains the default set of tables. This is
|
||||
not supplied in the distribution, but is built by the program dftables
|
||||
@@ -299,12 +347,20 @@ The distribution should contain the following files:
|
||||
headers:
|
||||
|
||||
dftables.c auxiliary program for building chartables.c
|
||||
|
||||
get.c )
|
||||
maketables.c )
|
||||
study.c ) source of
|
||||
pcre.c ) the functions
|
||||
study.c ) source of the functions
|
||||
pcre.c ) in the library
|
||||
pcreposix.c )
|
||||
printint.c )
|
||||
|
||||
ucp.c )
|
||||
ucp.h ) source for the code that is used for
|
||||
ucpinternal.h ) Unicode property handling
|
||||
ucptable.c )
|
||||
ucptypetable.c )
|
||||
|
||||
pcre.in "source" for the header for the external API; pcre.h
|
||||
is built from this by "configure"
|
||||
pcreposix.h header for the external POSIX wrapper API
|
||||
@@ -335,7 +391,9 @@ The distribution should contain the following files:
|
||||
doc/pcretest.txt plain text documentation of test program
|
||||
doc/perltest.txt plain text documentation of Perl test program
|
||||
install-sh a shell script for installing files
|
||||
libpcre.pc.in "source" for libpcre.pc for pkg-config
|
||||
ltmain.sh file used to build a libtool script
|
||||
mkinstalldirs script for making install directories
|
||||
pcretest.c comprehensive test program
|
||||
pcredemo.c simple demonstration of coding calls to PCRE
|
||||
perltest Perl test program
|
||||
@@ -346,15 +404,19 @@ The distribution should contain the following files:
|
||||
testdata/testinput3 test data for locale-specific tests
|
||||
testdata/testinput4 test data for UTF-8 tests compatible with Perl
|
||||
testdata/testinput5 test data for other UTF-8 tests
|
||||
testdata/testinput6 test data for Unicode property support tests
|
||||
testdata/testoutput1 test results corresponding to testinput1
|
||||
testdata/testoutput2 test results corresponding to testinput2
|
||||
testdata/testoutput3 test results corresponding to testinput3
|
||||
testdata/testoutput4 test results corresponding to testinput4
|
||||
testdata/testoutput5 test results corresponding to testinput5
|
||||
testdata/testoutput6 test results corresponding to testinput6
|
||||
|
||||
(C) Auxiliary files for Win32 DLL
|
||||
|
||||
dll.mk
|
||||
libpcre.def
|
||||
libpcreposix.def
|
||||
pcre.def
|
||||
|
||||
(D) Auxiliary file for VPASCAL
|
||||
@@ -362,4 +424,4 @@ The distribution should contain the following files:
|
||||
makevp.bat
|
||||
|
||||
Philip Hazel <ph10@cam.ac.uk>
|
||||
December 2003
|
||||
September 2004
|
||||
|
||||
+21
-15
@@ -11,26 +11,32 @@ Written by: Philip Hazel <ph10@cam.ac.uk>
|
||||
Copyright (c) 1997-2004 University of Cambridge
|
||||
|
||||
-----------------------------------------------------------------------------
|
||||
Permission is granted to anyone to use this software for any purpose on any
|
||||
computer system, and to redistribute it freely, subject to the following
|
||||
restrictions:
|
||||
Redistribution and use in source and binary forms, with or without
|
||||
modification, are permitted provided that the following conditions are met:
|
||||
|
||||
1. This software is distributed in the hope that it will be useful,
|
||||
but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
|
||||
* Redistributions of source code must retain the above copyright notice,
|
||||
this list of conditions and the following disclaimer.
|
||||
|
||||
2. The origin of this software must not be misrepresented, either by
|
||||
explicit claim or by omission.
|
||||
* Redistributions in binary form must reproduce the above copyright
|
||||
notice, this list of conditions and the following disclaimer in the
|
||||
documentation and/or other materials provided with the distribution.
|
||||
|
||||
3. Altered versions must be plainly marked as such, and must not be
|
||||
misrepresented as being the original software.
|
||||
* Neither the name of the University of Cambridge nor the names of its
|
||||
contributors may be used to endorse or promote products derived from
|
||||
this software without specific prior written permission.
|
||||
|
||||
4. If PCRE is embedded in any software that is released under the GNU
|
||||
General Purpose Licence (GPL), then the terms of that licence shall
|
||||
supersede any condition above with which it is incompatible.
|
||||
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
|
||||
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
|
||||
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
|
||||
ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
|
||||
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
|
||||
CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
|
||||
SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
|
||||
INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
|
||||
CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
|
||||
ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
|
||||
POSSIBILITY OF SUCH DAMAGE.
|
||||
-----------------------------------------------------------------------------
|
||||
|
||||
See the file Tech.Notes for some information on the internals.
|
||||
*/
|
||||
|
||||
|
||||
|
||||
@@ -1,6 +1,9 @@
|
||||
Technical Notes about PCRE
|
||||
--------------------------
|
||||
|
||||
Historical note 1
|
||||
-----------------
|
||||
|
||||
Many years ago I implemented some regular expression functions to an algorithm
|
||||
suggested by Martin Richards. These were not Unix-like in form, and were quite
|
||||
restricted in what they could do by comparison with Perl. The interesting part
|
||||
@@ -9,12 +12,15 @@ form of an expression was known in advance. The code to apply an expression did
|
||||
not operate by backtracking, as the original Henry Spencer code and current
|
||||
Perl code does, but instead checked all possibilities simultaneously by keeping
|
||||
a list of current states and checking all of them as it advanced through the
|
||||
subject string. (In the terminology of Jeffrey Friedl's book, it was a "DFA
|
||||
algorithm".) When the pattern was all used up, all remaining states were
|
||||
subject string. In the terminology of Jeffrey Friedl's book, it was a "DFA
|
||||
algorithm". When the pattern was all used up, all remaining states were
|
||||
possible matches, and the one matching the longest subset of the subject string
|
||||
was chosen. This did not necessarily maximize the individual wild portions of
|
||||
the pattern, as is expected in Unix and Perl-style regular expressions.
|
||||
|
||||
Historical note 2
|
||||
-----------------
|
||||
|
||||
By contrast, the code originally written by Henry Spencer and subsequently
|
||||
heavily modified for Perl actually compiles the expression twice: once in a
|
||||
dummy mode in order to find out how much store will be needed, and then for
|
||||
@@ -23,6 +29,9 @@ optionally, minimizing in Perl) the amount of the subject that matches
|
||||
individual wild portions of the pattern. This is an "NFA algorithm" in Friedl's
|
||||
terminology.
|
||||
|
||||
OK, here's the real stuff
|
||||
-------------------------
|
||||
|
||||
For the set of functions that forms PCRE (which are unrelated to those
|
||||
mentioned above), I tried at first to invent an algorithm that used an amount
|
||||
of store bounded by a multiple of the number of characters in the pattern, to
|
||||
@@ -38,8 +47,16 @@ got quite big anyway to handle all the Perl stuff.
|
||||
|
||||
The compiled form of a pattern is a vector of bytes, containing items of
|
||||
variable length. The first byte in an item is an opcode, and the length of the
|
||||
item is either implicit in the opcode or contained in the data bytes which
|
||||
follow it. A list of all the opcodes follows:
|
||||
item is either implicit in the opcode or contained in the data bytes that
|
||||
follow it.
|
||||
|
||||
In many cases below "two-byte" data values are specified. This is in fact just
|
||||
a default. PCRE can be compiled to use 3-byte or 4-byte values (impairing the
|
||||
performance). This is necessary only when patterns whose compiled length is
|
||||
greater than 64K are going to be processed. In this description, we assume the
|
||||
"normal" compilation options.
|
||||
|
||||
A list of all the opcodes follows:
|
||||
|
||||
Opcodes with no following data
|
||||
------------------------------
|
||||
@@ -48,7 +65,7 @@ These items are all just one byte long
|
||||
|
||||
OP_END end of pattern
|
||||
OP_ANY match any character
|
||||
OP_ANYBYTE match any single byte, even in UTF-8 mode
|
||||
OP_ANYBYTE match any single byte, even in UTF-8 mode
|
||||
OP_SOD match start of data: \A
|
||||
OP_SOM start of match (subject + offset): \G
|
||||
OP_CIRC ^ (start of data, or after \n in multiline)
|
||||
@@ -63,13 +80,14 @@ These items are all just one byte long
|
||||
OP_EODN match end of data or \n at end: \Z
|
||||
OP_EOD match end of data: \z
|
||||
OP_DOLL $ (end of data, or before \n in multiline)
|
||||
|
||||
OP_EXTUNI match an extended Unicode character
|
||||
|
||||
|
||||
Repeating single characters
|
||||
---------------------------
|
||||
|
||||
The common repeats (*, +, ?) when applied to a single character appear as
|
||||
two-byte items using the following opcodes:
|
||||
The common repeats (*, +, ?) when applied to a single character use the
|
||||
following opcodes:
|
||||
|
||||
OP_STAR
|
||||
OP_MINSTAR
|
||||
@@ -78,6 +96,7 @@ two-byte items using the following opcodes:
|
||||
OP_QUERY
|
||||
OP_MINQUERY
|
||||
|
||||
In ASCII mode, these are two-byte items; in UTF-8 mode, the length is variable.
|
||||
Those with "MIN" in their name are the minimizing versions. Each is followed by
|
||||
the character that is to be repeated. Other repeats make use of
|
||||
|
||||
@@ -109,39 +128,52 @@ byte. The opcodes are:
|
||||
OP_TYPEEXACT
|
||||
|
||||
|
||||
Matching a character string
|
||||
Match by Unicode property
|
||||
-------------------------
|
||||
|
||||
OP_PROP and OP_NOTPROP are used for positive and negative matches of a
|
||||
character by testing its Unicode property (the \p and \P escape sequences).
|
||||
Each is followed by a single byte that encodes the desired property value.
|
||||
|
||||
Repeats of these items use the OP_TYPESTAR etc. set of opcodes, followed by two
|
||||
bytes: OP_PROP or OP_NOTPROP and then the desired property value.
|
||||
|
||||
|
||||
Matching literal characters
|
||||
---------------------------
|
||||
|
||||
The OP_CHARS opcode is followed by a one-byte count and then that number of
|
||||
characters. If there are more than 255 characters in sequence, successive
|
||||
instances of OP_CHARS are used.
|
||||
The OP_CHAR opcode is followed by a single character that is to be matched
|
||||
casefully. For caseless matching, OP_CHARNC is used. In UTF-8 mode, the
|
||||
character may be more than one byte long. (Earlier versions of PCRE used
|
||||
multi-character strings, but this was changed to allow some new features to be
|
||||
added.)
|
||||
|
||||
|
||||
Character classes
|
||||
-----------------
|
||||
|
||||
If there is only one character, OP_CHARS is used for a positive class,
|
||||
and OP_NOT for a negative one (that is, for something like [^a]). However, in
|
||||
UTF-8 mode, this applies only to characters with values < 128, because OP_NOT
|
||||
is confined to single bytes.
|
||||
If there is only one character, OP_CHAR or OP_CHARNC is used for a positive
|
||||
class, and OP_NOT for a negative one (that is, for something like [^a]).
|
||||
However, in UTF-8 mode, the use of OP_NOT applies only to characters with
|
||||
values < 128, because OP_NOT is confined to single bytes.
|
||||
|
||||
Another set of repeating opcodes (OP_NOTSTAR etc.) are used for a repeated,
|
||||
negated, single-character class. The normal ones (OP_STAR etc.) are used for a
|
||||
repeated positive single-character class.
|
||||
|
||||
When there's more than one character in a class and all the characters are less
|
||||
than 256, OP_CLASS is used for a positive class, and OP_NCLASS for a negative
|
||||
than 256, OP_CLASS is used for a positive class, and OP_NCLASS for a negative
|
||||
one. In either case, the opcode is followed by a 32-byte bit map containing a 1
|
||||
bit for every character that is acceptable. The bits are counted from the least
|
||||
significant end of each byte.
|
||||
|
||||
The reason for having both OP_CLASS and OP_NCLASS is so that, in UTF-8 mode,
|
||||
subject characters with values greater than 256 can be handled correctly. For
|
||||
The reason for having both OP_CLASS and OP_NCLASS is so that, in UTF-8 mode,
|
||||
subject characters with values greater than 255 can be handled correctly. For
|
||||
OP_CLASS they don't match, whereas for OP_NCLASS they do.
|
||||
|
||||
For classes containing characters with values > 255, OP_XCLASS is used. It
|
||||
optionally uses a bit map (if any characters lie within it), followed by a list
|
||||
of pairs and single characters. There is a flag character than indicates
|
||||
of pairs and single characters. There is a flag character that indicates
|
||||
whether it's a positive or a negative class.
|
||||
|
||||
|
||||
@@ -192,14 +224,14 @@ the bracket itself. (They could have all been done like this, but I was making
|
||||
minimal changes.)
|
||||
|
||||
A bracket opcode is followed by two bytes which give the offset to the next
|
||||
alternative OP_ALT or, if there aren't any branches, to the matching KET
|
||||
alternative OP_ALT or, if there aren't any branches, to the matching OP_KET
|
||||
opcode. Each OP_ALT is followed by two bytes giving the offset to the next one,
|
||||
or to the KET opcode.
|
||||
or to the OP_KET opcode.
|
||||
|
||||
OP_KET is used for subpatterns that do not repeat indefinitely, while
|
||||
OP_KETRMIN and OP_KETRMAX are used for indefinite repetitions, minimally or
|
||||
maximally respectively. All three are followed by two bytes giving (as a
|
||||
positive number) the offset back to the matching BRA opcode.
|
||||
positive number) the offset back to the matching OP_BRA opcode.
|
||||
|
||||
If a subpattern is quantified such that it is permitted to match zero times, it
|
||||
is preceded by one of OP_BRAZERO or OP_BRAMINZERO. These are single-byte
|
||||
@@ -207,15 +239,14 @@ opcodes which tell the matcher that skipping this subpattern entirely is a
|
||||
valid branch.
|
||||
|
||||
A subpattern with an indefinite maximum repetition is replicated in the
|
||||
compiled data its minimum number of times (or once with a BRAZERO if the
|
||||
minimum is zero), with the final copy terminating with a KETRMIN or KETRMAX as
|
||||
appropriate.
|
||||
compiled data its minimum number of times (or once with OP_BRAZERO if the
|
||||
minimum is zero), with the final copy terminating with OP_KETRMIN or OP_KETRMAX
|
||||
as appropriate.
|
||||
|
||||
A subpattern with a bounded maximum repetition is replicated in a nested
|
||||
fashion up to the maximum number of times, with BRAZERO or BRAMINZERO before
|
||||
each replication after the minimum, so that, for example, (abc){2,5} is
|
||||
compiled as (abc)(abc)((abc)((abc)(abc)?)?)?. The 99 and 200 bracket limits do
|
||||
not apply to these internally generated brackets.
|
||||
fashion up to the maximum number of times, with OP_BRAZERO or OP_BRAMINZERO
|
||||
before each replication after the minimum, so that, for example, (abc){2,5} is
|
||||
compiled as (abc)(abc)((abc)((abc)(abc)?)?)?.
|
||||
|
||||
|
||||
Assertions
|
||||
@@ -260,8 +291,11 @@ from the start of the whole pattern.
|
||||
Callout
|
||||
-------
|
||||
|
||||
OP_CALLOUT is followed by one byte of data that holds a callout number in the
|
||||
range 0 to 255.
|
||||
OP_CALLOUT is followed by one byte of data that holds a callout number in the
|
||||
range 0 to 254 for manual callouts, or 255 for an automatic callout. In both
|
||||
cases there follows a two-byte value giving the offset in the pattern to the
|
||||
start of the following item, and another two-byte item giving the length of the
|
||||
next item.
|
||||
|
||||
|
||||
Changing options
|
||||
@@ -278,4 +312,4 @@ at compile time, and so does not cause anything to be put into the compiled
|
||||
data.
|
||||
|
||||
Philip Hazel
|
||||
August 2003
|
||||
September 2004
|
||||
|
||||
+1509
-909
File diff suppressed because it is too large
Load Diff
+23
-15
@@ -9,32 +9,40 @@ the file Tech.Notes for some information on the internals.
|
||||
|
||||
Written by: Philip Hazel <ph10@cam.ac.uk>
|
||||
|
||||
Copyright (c) 1997-2004 University of Cambridge
|
||||
Copyright (c) 1997-2003 University of Cambridge
|
||||
|
||||
-----------------------------------------------------------------------------
|
||||
Permission is granted to anyone to use this software for any purpose on any
|
||||
computer system, and to redistribute it freely, subject to the following
|
||||
restrictions:
|
||||
Redistribution and use in source and binary forms, with or without
|
||||
modification, are permitted provided that the following conditions are met:
|
||||
|
||||
1. This software is distributed in the hope that it will be useful,
|
||||
but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
|
||||
* Redistributions of source code must retain the above copyright notice,
|
||||
this list of conditions and the following disclaimer.
|
||||
|
||||
2. The origin of this software must not be misrepresented, either by
|
||||
explicit claim or by omission.
|
||||
* Redistributions in binary form must reproduce the above copyright
|
||||
notice, this list of conditions and the following disclaimer in the
|
||||
documentation and/or other materials provided with the distribution.
|
||||
|
||||
3. Altered versions must be plainly marked as such, and must not be
|
||||
misrepresented as being the original software.
|
||||
* Neither the name of the University of Cambridge nor the names of its
|
||||
contributors may be used to endorse or promote products derived from
|
||||
this software without specific prior written permission.
|
||||
|
||||
4. If PCRE is embedded in any software that is released under the GNU
|
||||
General Purpose Licence (GPL), then the terms of that licence shall
|
||||
supersede any condition above with which it is incompatible.
|
||||
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
|
||||
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
|
||||
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
|
||||
ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
|
||||
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
|
||||
CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
|
||||
SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
|
||||
INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
|
||||
CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
|
||||
ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
|
||||
POSSIBILITY OF SUCH DAMAGE.
|
||||
-----------------------------------------------------------------------------
|
||||
*/
|
||||
|
||||
/* This module contains some convenience functions for extracting substrings
|
||||
from the subject string after a regex match has succeeded. The original idea
|
||||
for these functions came from Scott Wimer <scottw@cgibuilder.com>. */
|
||||
for these functions came from Scott Wimer. */
|
||||
|
||||
|
||||
/* Include the internals header, which itself includes Standard C headers plus
|
||||
|
||||
+221
-158
@@ -5,30 +5,38 @@
|
||||
|
||||
/* This is a library of functions to support regular expressions whose syntax
|
||||
and semantics are as close as possible to those of the Perl 5 language. See
|
||||
the file Tech.Notes for some information on the internals.
|
||||
the file doc/Tech.Notes for some information on the internals.
|
||||
|
||||
Written by: Philip Hazel <ph10@cam.ac.uk>
|
||||
|
||||
Copyright (c) 1997-2003 University of Cambridge
|
||||
Copyright (c) 1997-2004 University of Cambridge
|
||||
|
||||
-----------------------------------------------------------------------------
|
||||
Permission is granted to anyone to use this software for any purpose on any
|
||||
computer system, and to redistribute it freely, subject to the following
|
||||
restrictions:
|
||||
Redistribution and use in source and binary forms, with or without
|
||||
modification, are permitted provided that the following conditions are met:
|
||||
|
||||
1. This software is distributed in the hope that it will be useful,
|
||||
but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
|
||||
* Redistributions of source code must retain the above copyright notice,
|
||||
this list of conditions and the following disclaimer.
|
||||
|
||||
2. The origin of this software must not be misrepresented, either by
|
||||
explicit claim or by omission.
|
||||
* Redistributions in binary form must reproduce the above copyright
|
||||
notice, this list of conditions and the following disclaimer in the
|
||||
documentation and/or other materials provided with the distribution.
|
||||
|
||||
3. Altered versions must be plainly marked as such, and must not be
|
||||
misrepresented as being the original software.
|
||||
* Neither the name of the University of Cambridge nor the names of its
|
||||
contributors may be used to endorse or promote products derived from
|
||||
this software without specific prior written permission.
|
||||
|
||||
4. If PCRE is embedded in any software that is released under the GNU
|
||||
General Purpose Licence (GPL), then the terms of that licence shall
|
||||
supersede any condition above with which it is incompatible.
|
||||
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
|
||||
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
|
||||
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
|
||||
ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
|
||||
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
|
||||
CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
|
||||
SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
|
||||
INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
|
||||
CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
|
||||
ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
|
||||
POSSIBILITY OF SUCH DAMAGE.
|
||||
-----------------------------------------------------------------------------
|
||||
*/
|
||||
|
||||
@@ -45,6 +53,18 @@ modules, but which are not relevant to the outside. */
|
||||
# include <php_config.h>
|
||||
#endif
|
||||
|
||||
/* Standard C headers plus the external interface definition. The only time
|
||||
setjmp and stdarg are used is when NO_RECURSE is set. */
|
||||
|
||||
#include <ctype.h>
|
||||
#include <limits.h>
|
||||
#include <setjmp.h>
|
||||
#include <stdarg.h>
|
||||
#include <stddef.h>
|
||||
#include <stdio.h>
|
||||
#include <stdlib.h>
|
||||
#include <string.h>
|
||||
|
||||
#ifndef PCRE_SPY
|
||||
#define PCRE_DEFINITION /* Win32 __declspec(export) trigger for .dll */
|
||||
#endif
|
||||
@@ -57,24 +77,45 @@ On Unix systems, "configure" can be used to override this default. */
|
||||
#define NEWLINE '\n'
|
||||
#endif
|
||||
|
||||
/* The value of MATCH_LIMIT determines the default number of times the match()
|
||||
function can be called during a single execution of pcre_exec(). (There is a
|
||||
runtime method of setting a different limit.) The limit exists in order to
|
||||
catch runaway regular expressions that take for ever to determine that they do
|
||||
not match. The default is set very large so that it does not accidentally catch
|
||||
legitimate cases. On Unix systems, "configure" can be used to override this
|
||||
default. */
|
||||
|
||||
#ifndef MATCH_LIMIT
|
||||
#define MATCH_LIMIT 10000000
|
||||
#endif
|
||||
|
||||
/* If you are compiling for a system that needs some magic to be inserted
|
||||
* before the definition of an exported function, define this macro to contain
|
||||
* the relevant magic. It appears at the start of every exported function. */
|
||||
|
||||
#define EXPORT
|
||||
|
||||
/* We need to have types that specify unsigned 16-bit and 32-bit integers. We
|
||||
cannot determine these outside the compilation (e.g. by running a program as
|
||||
part of "configure") because PCRE is often cross-compiled for use on other
|
||||
systems. Instead we make use of the maximum sizes that are available at
|
||||
preprocessor time in standard C environments. */
|
||||
|
||||
#if USHRT_MAX == 65535
|
||||
typedef unsigned short pcre_uint16;
|
||||
#elif UINT_MAX == 65535
|
||||
typedef unsigned int pcre_uint16;
|
||||
#else
|
||||
#error Cannot determine a type for 16-bit unsigned integers
|
||||
#endif
|
||||
|
||||
#if UINT_MAX == 4294967295
|
||||
typedef unsigned int pcre_uint32;
|
||||
#elif ULONG_MAX == 4294967295
|
||||
typedef unsigned long int pcre_uint32;
|
||||
#else
|
||||
#error Cannot determine a type for 32-bit unsigned integers
|
||||
#endif
|
||||
|
||||
/* All character handling must be done as unsigned characters. Otherwise there
|
||||
are problems with top-bit-set characters and functions such as isspace().
|
||||
However, we leave the interface to the outside world as char *, because that
|
||||
should make things easier for callers. We define a short type for unsigned char
|
||||
to save lots of typing. I tried "uchar", but it causes problems on Digital
|
||||
Unix, where it is defined in sys/types, so use "uschar" instead. */
|
||||
|
||||
typedef unsigned char uschar;
|
||||
|
||||
/* Include the public PCRE header */
|
||||
|
||||
#include "pcre.h"
|
||||
|
||||
/* When compiling for use with the Virtual Pascal compiler, these functions
|
||||
@@ -95,18 +136,6 @@ neither (there some non-Unix environments where this is the case). This assumes
|
||||
that all calls to memmove are moving strings upwards in store, which is the
|
||||
case in PCRE. */
|
||||
|
||||
/* Standard C headers plus the external interface definition. The only time
|
||||
setjmp and stdarg are used is when NO_RECURSE is set. */
|
||||
|
||||
#include <ctype.h>
|
||||
#include <limits.h>
|
||||
#include <setjmp.h>
|
||||
#include <stdarg.h>
|
||||
#include <stddef.h>
|
||||
#include <stdio.h>
|
||||
#include <stdlib.h>
|
||||
#include <string.h>
|
||||
|
||||
#if ! HAVE_MEMMOVE
|
||||
#undef memmove /* some systems may have a macro */
|
||||
#if HAVE_BCOPY
|
||||
@@ -126,13 +155,14 @@ for (i = 0; i < n; ++i) *(--dest) = *(--src);
|
||||
#endif /* not VPCOMPAT */
|
||||
|
||||
|
||||
/* PCRE keeps offsets in its compiled code as 2-byte quantities by default.
|
||||
These are used, for example, to link from the start of a subpattern to its
|
||||
alternatives and its end. The use of 2 bytes per offset limits the size of the
|
||||
compiled regex to around 64K, which is big enough for almost everybody.
|
||||
However, I received a request for an even bigger limit. For this reason, and
|
||||
also to make the code easier to maintain, the storing and loading of offsets
|
||||
from the byte string is now handled by the macros that are defined here.
|
||||
/* PCRE keeps offsets in its compiled code as 2-byte quantities (always stored
|
||||
in big-endian order) by default. These are used, for example, to link from the
|
||||
start of a subpattern to its alternatives and its end. The use of 2 bytes per
|
||||
offset limits the size of the compiled regex to around 64K, which is big enough
|
||||
for almost everybody. However, I received a request for an even bigger limit.
|
||||
For this reason, and also to make the code easier to maintain, the storing and
|
||||
loading of offsets from the byte string is now handled by the macros that are
|
||||
defined here.
|
||||
|
||||
The macros are controlled by the value of LINK_SIZE. This defaults to 2 in
|
||||
the config.h file, but can be overridden by using -D on the command line. This
|
||||
@@ -208,6 +238,7 @@ Standard C system should have one. */
|
||||
#define offsetof(p_type,field) ((size_t)&(((p_type *)0)->field))
|
||||
#endif
|
||||
|
||||
|
||||
/* These are the public options that can change during matching. */
|
||||
|
||||
#define PCRE_IMS (PCRE_CASELESS|PCRE_MULTILINE|PCRE_DOTALL)
|
||||
@@ -216,12 +247,13 @@ Standard C system should have one. */
|
||||
but skip the top bit so we can use ints for convenience without getting tangled
|
||||
with negative values. The public options defined in pcre.h start at the least
|
||||
significant end. Make sure they don't overlap, though now that we have expanded
|
||||
to four bytes there is plenty of space. */
|
||||
to four bytes, there is plenty of space. */
|
||||
|
||||
#define PCRE_FIRSTSET 0x40000000 /* first_byte is set */
|
||||
#define PCRE_REQCHSET 0x20000000 /* req_byte is set */
|
||||
#define PCRE_STARTLINE 0x10000000 /* start after \n for multiline */
|
||||
#define PCRE_ICHANGED 0x08000000 /* i option changes within regex */
|
||||
#define PCRE_NOPARTIAL 0x04000000 /* can't use partial with this regex */
|
||||
|
||||
/* Options for the "extra" block produced by pcre_study(). */
|
||||
|
||||
@@ -233,10 +265,11 @@ time, run time or study time, respectively. */
|
||||
#define PUBLIC_OPTIONS \
|
||||
(PCRE_CASELESS|PCRE_EXTENDED|PCRE_ANCHORED|PCRE_MULTILINE| \
|
||||
PCRE_DOTALL|PCRE_DOLLAR_ENDONLY|PCRE_EXTRA|PCRE_UNGREEDY|PCRE_UTF8| \
|
||||
PCRE_NO_AUTO_CAPTURE|PCRE_NO_UTF8_CHECK)
|
||||
PCRE_NO_AUTO_CAPTURE|PCRE_NO_UTF8_CHECK|PCRE_AUTO_CALLOUT)
|
||||
|
||||
#define PUBLIC_EXEC_OPTIONS \
|
||||
(PCRE_ANCHORED|PCRE_NOTBOL|PCRE_NOTEOL|PCRE_NOTEMPTY|PCRE_NO_UTF8_CHECK)
|
||||
(PCRE_ANCHORED|PCRE_NOTBOL|PCRE_NOTEOL|PCRE_NOTEMPTY|PCRE_NO_UTF8_CHECK| \
|
||||
PCRE_PARTIAL)
|
||||
|
||||
#define PUBLIC_STUDY_OPTIONS 0 /* None defined */
|
||||
|
||||
@@ -296,12 +329,13 @@ definitions below, up to ESC_z. There's a dummy for OP_ANY because it
|
||||
corresponds to "." rather than an escape sequence. The final one must be
|
||||
ESC_REF as subsequent values are used for \1, \2, \3, etc. There are two
|
||||
tests in the code for an escape greater than ESC_b and less than ESC_Z to
|
||||
detect the types that may be repeated. These are the types that consume a
|
||||
character. If any new escapes are put in between that don't consume a
|
||||
detect the types that may be repeated. These are the types that consume
|
||||
characters. If any new escapes are put in between that don't consume a
|
||||
character, that code will have to change. */
|
||||
|
||||
enum { ESC_A = 1, ESC_G, ESC_B, ESC_b, ESC_D, ESC_d, ESC_S, ESC_s, ESC_W,
|
||||
ESC_w, ESC_dum1, ESC_C, ESC_Z, ESC_z, ESC_E, ESC_Q, ESC_REF };
|
||||
ESC_w, ESC_dum1, ESC_C, ESC_P, ESC_p, ESC_X, ESC_Z, ESC_z, ESC_E,
|
||||
ESC_Q, ESC_REF };
|
||||
|
||||
/* Flag bits and data types for the extended class (OP_XCLASS) for classes that
|
||||
contain UTF-8 characters with values greater than 255. */
|
||||
@@ -312,6 +346,8 @@ contain UTF-8 characters with values greater than 255. */
|
||||
#define XCL_END 0 /* Marks end of individual items */
|
||||
#define XCL_SINGLE 1 /* Single item (one multibyte char) follows */
|
||||
#define XCL_RANGE 2 /* A range (two multibyte chars) follows */
|
||||
#define XCL_PROP 3 /* Unicode property (one property code) follows */
|
||||
#define XCL_NOTPROP 4 /* Unicode inverted property (ditto) */
|
||||
|
||||
|
||||
/* Opcode table: OP_BRA must be last, as all values >= it are used for brackets
|
||||
@@ -337,100 +373,112 @@ enum {
|
||||
OP_WORDCHAR, /* 10 \w */
|
||||
OP_ANY, /* 11 Match any character */
|
||||
OP_ANYBYTE, /* 12 Match any byte (\C); different to OP_ANY for UTF-8 */
|
||||
OP_EODN, /* 13 End of data or \n at end of data: \Z. */
|
||||
OP_EOD, /* 14 End of data: \z */
|
||||
OP_NOTPROP, /* 13 \P (not Unicode property) */
|
||||
OP_PROP, /* 14 \p (Unicode property) */
|
||||
OP_EXTUNI, /* 15 \X (extended Unicode sequence */
|
||||
OP_EODN, /* 16 End of data or \n at end of data: \Z. */
|
||||
OP_EOD, /* 17 End of data: \z */
|
||||
|
||||
OP_OPT, /* 15 Set runtime options */
|
||||
OP_CIRC, /* 16 Start of line - varies with multiline switch */
|
||||
OP_DOLL, /* 17 End of line - varies with multiline switch */
|
||||
OP_CHARS, /* 18 Match string of characters */
|
||||
OP_NOT, /* 19 Match anything but the following char */
|
||||
OP_OPT, /* 18 Set runtime options */
|
||||
OP_CIRC, /* 19 Start of line - varies with multiline switch */
|
||||
OP_DOLL, /* 20 End of line - varies with multiline switch */
|
||||
OP_CHAR, /* 21 Match one character, casefully */
|
||||
OP_CHARNC, /* 22 Match one character, caselessly */
|
||||
OP_NOT, /* 23 Match anything but the following char */
|
||||
|
||||
OP_STAR, /* 20 The maximizing and minimizing versions of */
|
||||
OP_MINSTAR, /* 21 all these opcodes must come in pairs, with */
|
||||
OP_PLUS, /* 22 the minimizing one second. */
|
||||
OP_MINPLUS, /* 23 This first set applies to single characters */
|
||||
OP_QUERY, /* 24 */
|
||||
OP_MINQUERY, /* 25 */
|
||||
OP_UPTO, /* 26 From 0 to n matches */
|
||||
OP_MINUPTO, /* 27 */
|
||||
OP_EXACT, /* 28 Exactly n matches */
|
||||
OP_STAR, /* 24 The maximizing and minimizing versions of */
|
||||
OP_MINSTAR, /* 25 all these opcodes must come in pairs, with */
|
||||
OP_PLUS, /* 26 the minimizing one second. */
|
||||
OP_MINPLUS, /* 27 This first set applies to single characters */
|
||||
OP_QUERY, /* 28 */
|
||||
OP_MINQUERY, /* 29 */
|
||||
OP_UPTO, /* 30 From 0 to n matches */
|
||||
OP_MINUPTO, /* 31 */
|
||||
OP_EXACT, /* 32 Exactly n matches */
|
||||
|
||||
OP_NOTSTAR, /* 29 The maximizing and minimizing versions of */
|
||||
OP_NOTMINSTAR, /* 30 all these opcodes must come in pairs, with */
|
||||
OP_NOTPLUS, /* 31 the minimizing one second. */
|
||||
OP_NOTMINPLUS, /* 32 This set applies to "not" single characters */
|
||||
OP_NOTQUERY, /* 33 */
|
||||
OP_NOTMINQUERY, /* 34 */
|
||||
OP_NOTUPTO, /* 35 From 0 to n matches */
|
||||
OP_NOTMINUPTO, /* 36 */
|
||||
OP_NOTEXACT, /* 37 Exactly n matches */
|
||||
OP_NOTSTAR, /* 33 The maximizing and minimizing versions of */
|
||||
OP_NOTMINSTAR, /* 34 all these opcodes must come in pairs, with */
|
||||
OP_NOTPLUS, /* 35 the minimizing one second. */
|
||||
OP_NOTMINPLUS, /* 36 This set applies to "not" single characters */
|
||||
OP_NOTQUERY, /* 37 */
|
||||
OP_NOTMINQUERY, /* 38 */
|
||||
OP_NOTUPTO, /* 39 From 0 to n matches */
|
||||
OP_NOTMINUPTO, /* 40 */
|
||||
OP_NOTEXACT, /* 41 Exactly n matches */
|
||||
|
||||
OP_TYPESTAR, /* 38 The maximizing and minimizing versions of */
|
||||
OP_TYPEMINSTAR, /* 39 all these opcodes must come in pairs, with */
|
||||
OP_TYPEPLUS, /* 40 the minimizing one second. These codes must */
|
||||
OP_TYPEMINPLUS, /* 41 be in exactly the same order as those above. */
|
||||
OP_TYPEQUERY, /* 42 This set applies to character types such as \d */
|
||||
OP_TYPEMINQUERY, /* 43 */
|
||||
OP_TYPEUPTO, /* 44 From 0 to n matches */
|
||||
OP_TYPEMINUPTO, /* 45 */
|
||||
OP_TYPEEXACT, /* 46 Exactly n matches */
|
||||
OP_TYPESTAR, /* 42 The maximizing and minimizing versions of */
|
||||
OP_TYPEMINSTAR, /* 43 all these opcodes must come in pairs, with */
|
||||
OP_TYPEPLUS, /* 44 the minimizing one second. These codes must */
|
||||
OP_TYPEMINPLUS, /* 45 be in exactly the same order as those above. */
|
||||
OP_TYPEQUERY, /* 46 This set applies to character types such as \d */
|
||||
OP_TYPEMINQUERY, /* 47 */
|
||||
OP_TYPEUPTO, /* 48 From 0 to n matches */
|
||||
OP_TYPEMINUPTO, /* 49 */
|
||||
OP_TYPEEXACT, /* 50 Exactly n matches */
|
||||
|
||||
OP_CRSTAR, /* 47 The maximizing and minimizing versions of */
|
||||
OP_CRMINSTAR, /* 48 all these opcodes must come in pairs, with */
|
||||
OP_CRPLUS, /* 49 the minimizing one second. These codes must */
|
||||
OP_CRMINPLUS, /* 50 be in exactly the same order as those above. */
|
||||
OP_CRQUERY, /* 51 These are for character classes and back refs */
|
||||
OP_CRMINQUERY, /* 52 */
|
||||
OP_CRRANGE, /* 53 These are different to the three seta above. */
|
||||
OP_CRMINRANGE, /* 54 */
|
||||
OP_CRSTAR, /* 51 The maximizing and minimizing versions of */
|
||||
OP_CRMINSTAR, /* 52 all these opcodes must come in pairs, with */
|
||||
OP_CRPLUS, /* 53 the minimizing one second. These codes must */
|
||||
OP_CRMINPLUS, /* 54 be in exactly the same order as those above. */
|
||||
OP_CRQUERY, /* 55 These are for character classes and back refs */
|
||||
OP_CRMINQUERY, /* 56 */
|
||||
OP_CRRANGE, /* 57 These are different to the three sets above. */
|
||||
OP_CRMINRANGE, /* 58 */
|
||||
|
||||
OP_CLASS, /* 55 Match a character class, chars < 256 only */
|
||||
OP_NCLASS, /* 56 Same, but the bitmap was created from a negative
|
||||
OP_CLASS, /* 59 Match a character class, chars < 256 only */
|
||||
OP_NCLASS, /* 60 Same, but the bitmap was created from a negative
|
||||
class - the difference is relevant only when a UTF-8
|
||||
character > 255 is encountered. */
|
||||
|
||||
OP_XCLASS, /* 57 Extended class for handling UTF-8 chars within the
|
||||
OP_XCLASS, /* 61 Extended class for handling UTF-8 chars within the
|
||||
class. This does both positive and negative. */
|
||||
|
||||
OP_REF, /* 58 Match a back reference */
|
||||
OP_RECURSE, /* 59 Match a numbered subpattern (possibly recursive) */
|
||||
OP_CALLOUT, /* 60 Call out to external function if provided */
|
||||
OP_REF, /* 62 Match a back reference */
|
||||
OP_RECURSE, /* 63 Match a numbered subpattern (possibly recursive) */
|
||||
OP_CALLOUT, /* 64 Call out to external function if provided */
|
||||
|
||||
OP_ALT, /* 61 Start of alternation */
|
||||
OP_KET, /* 62 End of group that doesn't have an unbounded repeat */
|
||||
OP_KETRMAX, /* 63 These two must remain together and in this */
|
||||
OP_KETRMIN, /* 64 order. They are for groups the repeat for ever. */
|
||||
OP_ALT, /* 65 Start of alternation */
|
||||
OP_KET, /* 66 End of group that doesn't have an unbounded repeat */
|
||||
OP_KETRMAX, /* 67 These two must remain together and in this */
|
||||
OP_KETRMIN, /* 68 order. They are for groups the repeat for ever. */
|
||||
|
||||
/* The assertions must come before ONCE and COND */
|
||||
|
||||
OP_ASSERT, /* 65 Positive lookahead */
|
||||
OP_ASSERT_NOT, /* 66 Negative lookahead */
|
||||
OP_ASSERTBACK, /* 67 Positive lookbehind */
|
||||
OP_ASSERTBACK_NOT, /* 68 Negative lookbehind */
|
||||
OP_REVERSE, /* 69 Move pointer back - used in lookbehind assertions */
|
||||
OP_ASSERT, /* 69 Positive lookahead */
|
||||
OP_ASSERT_NOT, /* 70 Negative lookahead */
|
||||
OP_ASSERTBACK, /* 71 Positive lookbehind */
|
||||
OP_ASSERTBACK_NOT, /* 72 Negative lookbehind */
|
||||
OP_REVERSE, /* 73 Move pointer back - used in lookbehind assertions */
|
||||
|
||||
/* ONCE and COND must come after the assertions, with ONCE first, as there's
|
||||
a test for >= ONCE for a subpattern that isn't an assertion. */
|
||||
|
||||
OP_ONCE, /* 70 Once matched, don't back up into the subpattern */
|
||||
OP_COND, /* 71 Conditional group */
|
||||
OP_CREF, /* 72 Used to hold an extraction string number (cond ref) */
|
||||
OP_ONCE, /* 74 Once matched, don't back up into the subpattern */
|
||||
OP_COND, /* 75 Conditional group */
|
||||
OP_CREF, /* 76 Used to hold an extraction string number (cond ref) */
|
||||
|
||||
OP_BRAZERO, /* 73 These two must remain together and in this */
|
||||
OP_BRAMINZERO, /* 74 order. */
|
||||
OP_BRAZERO, /* 77 These two must remain together and in this */
|
||||
OP_BRAMINZERO, /* 78 order. */
|
||||
|
||||
OP_BRANUMBER, /* 75 Used for extracting brackets whose number is greater
|
||||
OP_BRANUMBER, /* 79 Used for extracting brackets whose number is greater
|
||||
than can fit into an opcode. */
|
||||
|
||||
OP_BRA /* 76 This and greater values are used for brackets that
|
||||
extract substrings up to a basic limit. After that,
|
||||
use is made of OP_BRANUMBER. */
|
||||
OP_BRA /* 80 This and greater values are used for brackets that
|
||||
extract substrings up to EXTRACT_BASIC_MAX. After
|
||||
that, use is made of OP_BRANUMBER. */
|
||||
};
|
||||
|
||||
/* WARNING: There is an implicit assumption in study.c that all opcodes are
|
||||
less than 128 in value. This makes handling UTF-8 character sequences easier.
|
||||
*/
|
||||
/* WARNING WARNING WARNING: There is an implicit assumption in pcre.c and
|
||||
study.c that all opcodes are less than 128 in value. This makes handling UTF-8
|
||||
character sequences easier. */
|
||||
|
||||
/* The highest extraction number before we have to start using additional
|
||||
bytes. (Originally PCRE didn't have support for extraction counts higher than
|
||||
this number.) The value is limited by the number of opcodes left after OP_BRA,
|
||||
i.e. 255 - OP_BRA. We actually set it a bit lower to leave room for additional
|
||||
opcodes. */
|
||||
|
||||
#define EXTRACT_BASIC_MAX 100
|
||||
|
||||
|
||||
/* This macro defines textual names for all the opcodes. These are used only
|
||||
@@ -439,8 +487,10 @@ macro is referenced only in printint.c. */
|
||||
|
||||
#define OP_NAME_LIST \
|
||||
"End", "\\A", "\\G", "\\B", "\\b", "\\D", "\\d", \
|
||||
"\\S", "\\s", "\\W", "\\w", "Any", "Anybyte", "\\Z", "\\z", \
|
||||
"Opt", "^", "$", "chars", "not", \
|
||||
"\\S", "\\s", "\\W", "\\w", "Any", "Anybyte", \
|
||||
"notprop", "prop", "extuni", \
|
||||
"\\Z", "\\z", \
|
||||
"Opt", "^", "$", "char", "charnc", "not", \
|
||||
"*", "*?", "+", "+?", "?", "??", "{", "{", "{", \
|
||||
"*", "*?", "+", "+?", "?", "??", "{", "{", "{", \
|
||||
"*", "*?", "+", "+?", "?", "??", "{", "{", "{", \
|
||||
@@ -463,8 +513,11 @@ in UTF-8 mode. The code that uses this table must know about such things. */
|
||||
#define OP_LENGTHS \
|
||||
1, /* End */ \
|
||||
1, 1, 1, 1, 1, 1, 1, 1, 1, 1, /* \A, \G, \B, \B, \D, \d, \S, \s, \W, \w */ \
|
||||
1, 1, 1, 1, 2, 1, 1, /* Any, Anybyte, \Z, \z, Opt, ^, $ */ \
|
||||
2, /* Chars - the minimum length */ \
|
||||
1, 1, /* Any, Anybyte */ \
|
||||
2, 2, 1, /* NOTPROP, PROP, EXTUNI */ \
|
||||
1, 1, 2, 1, 1, /* \Z, \z, Opt, ^, $ */ \
|
||||
2, /* Char - the minimum length */ \
|
||||
2, /* Charnc - the minimum length */ \
|
||||
2, /* not */ \
|
||||
/* Positive single-char repeats ** These are */ \
|
||||
2, 2, 2, 2, 2, 2, /* *, *?, +, +?, ?, ?? ** minima in */ \
|
||||
@@ -483,7 +536,7 @@ in UTF-8 mode. The code that uses this table must know about such things. */
|
||||
0, /* XCLASS - variable length */ \
|
||||
3, /* REF */ \
|
||||
1+LINK_SIZE, /* RECURSE */ \
|
||||
2, /* CALLOUT */ \
|
||||
2+2*LINK_SIZE, /* CALLOUT */ \
|
||||
1+LINK_SIZE, /* Alt */ \
|
||||
1+LINK_SIZE, /* Ket */ \
|
||||
1+LINK_SIZE, /* KetRmax */ \
|
||||
@@ -501,14 +554,6 @@ in UTF-8 mode. The code that uses this table must know about such things. */
|
||||
1+LINK_SIZE /* BRA */ \
|
||||
|
||||
|
||||
/* The highest extraction number before we have to start using additional
|
||||
bytes. (Originally PCRE didn't have support for extraction counts highter than
|
||||
this number.) The value is limited by the number of opcodes left after OP_BRA,
|
||||
i.e. 255 - OP_BRA. We actually set it a bit lower to leave room for additional
|
||||
opcodes. */
|
||||
|
||||
#define EXTRACT_BASIC_MAX 150
|
||||
|
||||
/* A magic value for OP_CREF to indicate the "in recursion" condition. */
|
||||
|
||||
#define CREF_RECURSE 0xffff
|
||||
@@ -554,7 +599,7 @@ just to accommodate the POSIX wrapper. */
|
||||
#define ERR34 "character value in \\x{...} sequence is too large"
|
||||
#define ERR35 "invalid condition (?(0)"
|
||||
#define ERR36 "\\C not allowed in lookbehind assertion"
|
||||
#define ERR37 "PCRE does not support \\L, \\l, \\N, \\P, \\p, \\U, \\u, or \\X"
|
||||
#define ERR37 "PCRE does not support \\L, \\l, \\N, \\U, or \\u"
|
||||
#define ERR38 "number after (?C is > 255"
|
||||
#define ERR39 "closing ) for (?C expected"
|
||||
#define ERR40 "recursive call could loop indefinitely"
|
||||
@@ -562,37 +607,51 @@ just to accommodate the POSIX wrapper. */
|
||||
#define ERR42 "syntax error after (?P"
|
||||
#define ERR43 "two named groups have the same name"
|
||||
#define ERR44 "invalid UTF-8 string"
|
||||
|
||||
/* All character handling must be done as unsigned characters. Otherwise there
|
||||
are problems with top-bit-set characters and functions such as isspace().
|
||||
However, we leave the interface to the outside world as char *, because that
|
||||
should make things easier for callers. We define a short type for unsigned char
|
||||
to save lots of typing. I tried "uchar", but it causes problems on Digital
|
||||
Unix, where it is defined in sys/types, so use "uschar" instead. */
|
||||
|
||||
typedef unsigned char uschar;
|
||||
#define ERR45 "support for \\P, \\p, and \\X has not been compiled"
|
||||
#define ERR46 "malformed \\P or \\p sequence"
|
||||
#define ERR47 "unknown property name after \\P or \\p"
|
||||
|
||||
/* The real format of the start of the pcre block; the index of names and the
|
||||
code vector run on as long as necessary after the end. */
|
||||
code vector run on as long as necessary after the end. We store an explicit
|
||||
offset to the name table so that if a regex is compiled on one host, saved, and
|
||||
then run on another where the size of pointers is different, all might still
|
||||
be well. For the case of compiled-on-4 and run-on-8, we include an extra
|
||||
pointer that is always NULL. For future-proofing, we also include a few dummy
|
||||
fields - even though you can never get this planning right!
|
||||
|
||||
NOTE NOTE NOTE:
|
||||
Because people can now save and re-use compiled patterns, any additions to this
|
||||
structure should be made at the end, and something earlier (e.g. a new
|
||||
flag in the options or one of the dummy fields) should indicate that the new
|
||||
fields are present. Currently PCRE always sets the dummy fields to zero.
|
||||
NOTE NOTE NOTE:
|
||||
*/
|
||||
|
||||
typedef struct real_pcre {
|
||||
unsigned long int magic_number;
|
||||
size_t size; /* Total that was malloced */
|
||||
const unsigned char *tables; /* Pointer to tables */
|
||||
unsigned long int options;
|
||||
unsigned short int top_bracket;
|
||||
unsigned short int top_backref;
|
||||
unsigned short int first_byte;
|
||||
unsigned short int req_byte;
|
||||
unsigned short int name_entry_size; /* Size of any name items; 0 => none */
|
||||
unsigned short int name_count; /* Number of name items */
|
||||
pcre_uint32 magic_number;
|
||||
pcre_uint32 size; /* Total that was malloced */
|
||||
pcre_uint32 options;
|
||||
pcre_uint32 dummy1; /* For future use, maybe */
|
||||
|
||||
pcre_uint16 top_bracket;
|
||||
pcre_uint16 top_backref;
|
||||
pcre_uint16 first_byte;
|
||||
pcre_uint16 req_byte;
|
||||
pcre_uint16 name_table_offset; /* Offset to name table that follows */
|
||||
pcre_uint16 name_entry_size; /* Size of any name items */
|
||||
pcre_uint16 name_count; /* Number of name items */
|
||||
pcre_uint16 dummy2; /* For future use, maybe */
|
||||
|
||||
const unsigned char *tables; /* Pointer to tables or NULL for std */
|
||||
const unsigned char *nullpad; /* NULL padding */
|
||||
} real_pcre;
|
||||
|
||||
/* The format of the block used to store data from pcre_study(). */
|
||||
/* The format of the block used to store data from pcre_study(). The same
|
||||
remark (see NOTE above) about extending this structure applies. */
|
||||
|
||||
typedef struct pcre_study_data {
|
||||
size_t size; /* Total that was malloced */
|
||||
uschar options;
|
||||
pcre_uint32 size; /* Total that was malloced */
|
||||
pcre_uint32 options;
|
||||
uschar start_bits[32];
|
||||
} pcre_study_data;
|
||||
|
||||
@@ -605,12 +664,14 @@ typedef struct compile_data {
|
||||
const uschar *cbits; /* Points to character type table */
|
||||
const uschar *ctypes; /* Points to table of type maps */
|
||||
const uschar *start_code; /* The start of the compiled code */
|
||||
const uschar *start_pattern; /* The start of the pattern */
|
||||
uschar *name_table; /* The name/number table */
|
||||
int names_found; /* Number of entries so far */
|
||||
int name_entry_size; /* Size of each entry */
|
||||
int top_backref; /* Maximum back reference */
|
||||
unsigned int backref_map; /* Bitmap of low back refs */
|
||||
int req_varyopt; /* "After variable item" flag for reqbyte */
|
||||
BOOL nopartial; /* Set TRUE if partial won't work */
|
||||
} compile_data;
|
||||
|
||||
/* Structure for maintaining a chain of pointers to the currently incomplete
|
||||
@@ -660,6 +721,8 @@ typedef struct match_data {
|
||||
BOOL utf8; /* UTF8 flag */
|
||||
BOOL endonly; /* Dollar not before final \n */
|
||||
BOOL notempty; /* Empty string match not wanted */
|
||||
BOOL partial; /* PARTIAL flag */
|
||||
BOOL hitend; /* Hit the end of the subject at some point */
|
||||
const uschar *start_code; /* For use when recursing */
|
||||
const uschar *start_subject; /* Start of the subject string */
|
||||
const uschar *end_subject; /* End of the subject string */
|
||||
|
||||
@@ -0,0 +1,19 @@
|
||||
LIBRARY libpcre
|
||||
EXPORTS
|
||||
pcre_malloc
|
||||
pcre_free
|
||||
pcre_config
|
||||
pcre_callout
|
||||
pcre_compile
|
||||
pcre_copy_substring
|
||||
pcre_exec
|
||||
pcre_get_substring
|
||||
pcre_get_stringnumber
|
||||
pcre_get_substring_list
|
||||
pcre_free_substring
|
||||
pcre_free_substring_list
|
||||
pcre_info
|
||||
pcre_fullinfo
|
||||
pcre_maketables
|
||||
pcre_study
|
||||
pcre_version
|
||||
@@ -0,0 +1,24 @@
|
||||
LIBRARY libpcreposix
|
||||
EXPORTS
|
||||
pcre_malloc
|
||||
pcre_free
|
||||
pcre_config
|
||||
pcre_callout
|
||||
pcre_compile
|
||||
pcre_copy_substring
|
||||
pcre_exec
|
||||
pcre_get_substring
|
||||
pcre_get_stringnumber
|
||||
pcre_get_substring_list
|
||||
pcre_free_substring
|
||||
pcre_free_substring_list
|
||||
pcre_info
|
||||
pcre_fullinfo
|
||||
pcre_maketables
|
||||
pcre_study
|
||||
pcre_version
|
||||
|
||||
regcomp
|
||||
regexec
|
||||
regerror
|
||||
regfree
|
||||
@@ -8,29 +8,35 @@ and semantics are as close as possible to those of the Perl 5 language.
|
||||
|
||||
Written by: Philip Hazel <ph10@cam.ac.uk>
|
||||
|
||||
Copyright (c) 1997-2004 University of Cambridge
|
||||
Copyright (c) 1997-2003 University of Cambridge
|
||||
|
||||
-----------------------------------------------------------------------------
|
||||
Permission is granted to anyone to use this software for any purpose on any
|
||||
computer system, and to redistribute it freely, subject to the following
|
||||
restrictions:
|
||||
Redistribution and use in source and binary forms, with or without
|
||||
modification, are permitted provided that the following conditions are met:
|
||||
|
||||
1. This software is distributed in the hope that it will be useful,
|
||||
but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
|
||||
* Redistributions of source code must retain the above copyright notice,
|
||||
this list of conditions and the following disclaimer.
|
||||
|
||||
2. The origin of this software must not be misrepresented, either by
|
||||
explicit claim or by omission.
|
||||
* Redistributions in binary form must reproduce the above copyright
|
||||
notice, this list of conditions and the following disclaimer in the
|
||||
documentation and/or other materials provided with the distribution.
|
||||
|
||||
3. Altered versions must be plainly marked as such, and must not be
|
||||
misrepresented as being the original software.
|
||||
* Neither the name of the University of Cambridge nor the names of its
|
||||
contributors may be used to endorse or promote products derived from
|
||||
this software without specific prior written permission.
|
||||
|
||||
4. If PCRE is embedded in any software that is released under the GNU
|
||||
General Purpose Licence (GPL), then the terms of that licence shall
|
||||
supersede any condition above with which it is incompatible.
|
||||
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
|
||||
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
|
||||
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
|
||||
ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
|
||||
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
|
||||
CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
|
||||
SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
|
||||
INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
|
||||
CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
|
||||
ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
|
||||
POSSIBILITY OF SUCH DAMAGE.
|
||||
-----------------------------------------------------------------------------
|
||||
|
||||
See the file Tech.Notes for some information on the internals.
|
||||
*/
|
||||
|
||||
|
||||
|
||||
+1607
-716
File diff suppressed because it is too large
Load Diff
+52
-6
@@ -2,7 +2,39 @@
|
||||
* Perl-Compatible Regular Expressions *
|
||||
*************************************************/
|
||||
|
||||
/* Copyright (c) 1997-2003 University of Cambridge */
|
||||
/* In its original form, this is the .in file that is transformed by
|
||||
"configure" into pcre.h.
|
||||
|
||||
Copyright (c) 1997-2004 University of Cambridge
|
||||
|
||||
-----------------------------------------------------------------------------
|
||||
Redistribution and use in source and binary forms, with or without
|
||||
modification, are permitted provided that the following conditions are met:
|
||||
|
||||
* Redistributions of source code must retain the above copyright notice,
|
||||
this list of conditions and the following disclaimer.
|
||||
|
||||
* Redistributions in binary form must reproduce the above copyright
|
||||
notice, this list of conditions and the following disclaimer in the
|
||||
documentation and/or other materials provided with the distribution.
|
||||
|
||||
* Neither the name of the University of Cambridge nor the names of its
|
||||
contributors may be used to endorse or promote products derived from
|
||||
this software without specific prior written permission.
|
||||
|
||||
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
|
||||
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
|
||||
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
|
||||
ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
|
||||
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
|
||||
CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
|
||||
SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
|
||||
INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
|
||||
CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
|
||||
ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
|
||||
POSSIBILITY OF SUCH DAMAGE.
|
||||
-----------------------------------------------------------------------------
|
||||
*/
|
||||
|
||||
#ifndef _PCRE_H
|
||||
#define _PCRE_H
|
||||
@@ -12,9 +44,9 @@ make changes to pcre.in. */
|
||||
|
||||
#include "php_compat.h"
|
||||
|
||||
#define PCRE_MAJOR 4
|
||||
#define PCRE_MINOR 5
|
||||
#define PCRE_DATE 01-December-2003
|
||||
#define PCRE_MAJOR 5
|
||||
#define PCRE_MINOR 0
|
||||
#define PCRE_DATE 13-Sep-2004
|
||||
|
||||
/* Win32 uses DLL by default */
|
||||
|
||||
@@ -60,6 +92,8 @@ extern "C" {
|
||||
#define PCRE_UTF8 0x0800
|
||||
#define PCRE_NO_AUTO_CAPTURE 0x1000
|
||||
#define PCRE_NO_UTF8_CHECK 0x2000
|
||||
#define PCRE_AUTO_CALLOUT 0x4000
|
||||
#define PCRE_PARTIAL 0x8000
|
||||
|
||||
/* Exec-time and get/set-time error codes */
|
||||
|
||||
@@ -74,6 +108,10 @@ extern "C" {
|
||||
#define PCRE_ERROR_CALLOUT (-9) /* Never used by PCRE itself */
|
||||
#define PCRE_ERROR_BADUTF8 (-10)
|
||||
#define PCRE_ERROR_BADUTF8_OFFSET (-11)
|
||||
#define PCRE_ERROR_PARTIAL (-12)
|
||||
#define PCRE_ERROR_BADPARTIAL (-13)
|
||||
#define PCRE_ERROR_INTERNAL (-14)
|
||||
#define PCRE_ERROR_BADCOUNT (-15)
|
||||
|
||||
/* Request types for pcre_fullinfo() */
|
||||
|
||||
@@ -89,6 +127,7 @@ extern "C" {
|
||||
#define PCRE_INFO_NAMECOUNT 8
|
||||
#define PCRE_INFO_NAMETABLE 9
|
||||
#define PCRE_INFO_STUDYSIZE 10
|
||||
#define PCRE_INFO_DEFAULT_TABLES 11
|
||||
|
||||
/* Request types for pcre_config() */
|
||||
|
||||
@@ -98,12 +137,14 @@ extern "C" {
|
||||
#define PCRE_CONFIG_POSIX_MALLOC_THRESHOLD 3
|
||||
#define PCRE_CONFIG_MATCH_LIMIT 4
|
||||
#define PCRE_CONFIG_STACKRECURSE 5
|
||||
#define PCRE_CONFIG_UNICODE_PROPERTIES 6
|
||||
|
||||
/* Bit flags for the pcre_extra structure */
|
||||
|
||||
#define PCRE_EXTRA_STUDY_DATA 0x0001
|
||||
#define PCRE_EXTRA_MATCH_LIMIT 0x0002
|
||||
#define PCRE_EXTRA_CALLOUT_DATA 0x0004
|
||||
#define PCRE_EXTRA_TABLES 0x0008
|
||||
|
||||
/* Types */
|
||||
|
||||
@@ -111,13 +152,15 @@ struct real_pcre; /* declaration; the definition is private */
|
||||
typedef struct real_pcre pcre;
|
||||
|
||||
/* The structure for passing additional data to pcre_exec(). This is defined in
|
||||
such as way as to be extensible. */
|
||||
such a way as to be extensible. Always add new fields at the end, in order to
|
||||
remain compatible. */
|
||||
|
||||
/* Extra data block handed to pcre_exec(). The flags word is built from the
PCRE_EXTRA_* bit values and records which of the other fields have been set. */
typedef struct pcre_extra {
|
||||
unsigned long int flags; /* PCRE_EXTRA_* bits for which fields are set */
|
||||
void *study_data; /* Opaque data from pcre_study() (PCRE_EXTRA_STUDY_DATA) */
|
||||
unsigned long int match_limit; /* Maximum number of calls to match() (PCRE_EXTRA_MATCH_LIMIT) */
|
||||
void *callout_data; /* Data passed back in callouts (PCRE_EXTRA_CALLOUT_DATA) */
|
||||
const unsigned char *tables; /* Pointer to character tables (PCRE_EXTRA_TABLES) */
|
||||
} pcre_extra;
|
||||
|
||||
/* The structure for passing out data via the pcre_callout_function. We use a
|
||||
@@ -133,10 +176,13 @@ typedef struct pcre_callout_block {
|
||||
const char *subject; /* The subject being matched */
|
||||
int subject_length; /* The length of the subject */
|
||||
int start_match; /* Offset to start of this match attempt */
|
||||
int current_position; /* Where we currently are */
|
||||
int current_position; /* Where we currently are in the subject */
|
||||
int capture_top; /* Max current capture */
|
||||
int capture_last; /* Most recently closed capture */
|
||||
void *callout_data; /* Data passed in with the call */
|
||||
/* ------------------- Added for Version 1 -------------------------- */
|
||||
int pattern_position; /* Offset to next item in the pattern */
|
||||
int next_item_length; /* Length of next item in the pattern */
|
||||
/* ------------------------------------------------------------------ */
|
||||
} pcre_callout_block;
|
||||
|
||||
|
||||
@@ -0,0 +1,324 @@
|
||||
/*************************************************
|
||||
* PCRE DEMONSTRATION PROGRAM *
|
||||
*************************************************/
|
||||
|
||||
/* This is a demonstration program to illustrate the most straightforward ways
|
||||
of calling the PCRE regular expression library from a C program. See the
|
||||
pcresample documentation for a short discussion.
|
||||
|
||||
Compile thuswise:
|
||||
gcc -Wall pcredemo.c -I/usr/local/include -L/usr/local/lib \
|
||||
-R/usr/local/lib -lpcre
|
||||
|
||||
Replace "/usr/local/include" and "/usr/local/lib" with wherever the include and
|
||||
library files for PCRE are installed on your system. Only some operating
|
||||
systems (e.g. Solaris) use the -R option.
|
||||
*/
|
||||
|
||||
|
||||
#include <stdio.h>
|
||||
#include <string.h>
|
||||
#include <pcre.h>
|
||||
|
||||
#define OVECCOUNT 30 /* should be a multiple of 3 */
|
||||
|
||||
|
||||
int main(int argc, char **argv)
|
||||
{
|
||||
pcre *re;
|
||||
const char *error;
|
||||
char *pattern;
|
||||
char *subject;
|
||||
unsigned char *name_table;
|
||||
int erroffset;
|
||||
int find_all;
|
||||
int namecount;
|
||||
int name_entry_size;
|
||||
int ovector[OVECCOUNT];
|
||||
int subject_length;
|
||||
int rc, i;
|
||||
|
||||
|
||||
/**************************************************************************
|
||||
* First, sort out the command line. There is only one possible option at *
|
||||
* the moment, "-g" to request repeated matching to find all occurrences, *
|
||||
* like Perl's /g option. We set the variable find_all to a non-zero value *
|
||||
* if the -g option is present. Apart from that, there must be exactly two *
|
||||
* arguments. *
|
||||
**************************************************************************/
|
||||
|
||||
find_all = 0;
|
||||
for (i = 1; i < argc; i++)
|
||||
{
|
||||
if (strcmp(argv[i], "-g") == 0) find_all = 1;
|
||||
else break;
|
||||
}
|
||||
|
||||
/* After the options, we require exactly two arguments, which are the pattern,
|
||||
and the subject string. */
|
||||
|
||||
if (argc - i != 2)
|
||||
{
|
||||
printf("Two arguments required: a regex and a subject string\n");
|
||||
return 1;
|
||||
}
|
||||
|
||||
pattern = argv[i];
|
||||
subject = argv[i+1];
|
||||
subject_length = (int)strlen(subject);
|
||||
|
||||
|
||||
/*************************************************************************
|
||||
* Now we are going to compile the regular expression pattern, and handle *
|
||||
* and errors that are detected. *
|
||||
*************************************************************************/
|
||||
|
||||
re = pcre_compile(
|
||||
pattern, /* the pattern */
|
||||
0, /* default options */
|
||||
&error, /* for error message */
|
||||
&erroffset, /* for error offset */
|
||||
NULL); /* use default character tables */
|
||||
|
||||
/* Compilation failed: print the error message and exit */
|
||||
|
||||
if (re == NULL)
|
||||
{
|
||||
printf("PCRE compilation failed at offset %d: %s\n", erroffset, error);
|
||||
return 1;
|
||||
}
|
||||
|
||||
|
||||
/*************************************************************************
|
||||
* If the compilation succeeded, we call PCRE again, in order to do a *
|
||||
* pattern match against the subject string. This does just ONE match. If *
|
||||
* further matching is needed, it will be done below. *
|
||||
*************************************************************************/
|
||||
|
||||
rc = pcre_exec(
|
||||
re, /* the compiled pattern */
|
||||
NULL, /* no extra data - we didn't study the pattern */
|
||||
subject, /* the subject string */
|
||||
subject_length, /* the length of the subject */
|
||||
0, /* start at offset 0 in the subject */
|
||||
0, /* default options */
|
||||
ovector, /* output vector for substring information */
|
||||
OVECCOUNT); /* number of elements in the output vector */
|
||||
|
||||
/* Matching failed: handle error cases */
|
||||
|
||||
if (rc < 0)
|
||||
{
|
||||
switch(rc)
|
||||
{
|
||||
case PCRE_ERROR_NOMATCH: printf("No match\n"); break;
|
||||
/*
|
||||
Handle other special cases if you like
|
||||
*/
|
||||
default: printf("Matching error %d\n", rc); break;
|
||||
}
|
||||
free(re); /* Release memory used for the compiled pattern */
|
||||
return 1;
|
||||
}
|
||||
|
||||
/* Match succeded */
|
||||
|
||||
printf("\nMatch succeeded at offset %d\n", ovector[0]);
|
||||
|
||||
|
||||
/*************************************************************************
|
||||
* We have found the first match within the subject string. If the output *
|
||||
* vector wasn't big enough, set its size to the maximum. Then output any *
|
||||
* substrings that were captured. *
|
||||
*************************************************************************/
|
||||
|
||||
/* The output vector wasn't big enough */
|
||||
|
||||
if (rc == 0)
|
||||
{
|
||||
rc = OVECCOUNT/3;
|
||||
printf("ovector only has room for %d captured substrings\n", rc - 1);
|
||||
}
|
||||
|
||||
/* Show substrings stored in the output vector by number. Obviously, in a real
|
||||
application you might want to do things other than print them. */
|
||||
|
||||
for (i = 0; i < rc; i++)
|
||||
{
|
||||
char *substring_start = subject + ovector[2*i];
|
||||
int substring_length = ovector[2*i+1] - ovector[2*i];
|
||||
printf("%2d: %.*s\n", i, substring_length, substring_start);
|
||||
}
|
||||
|
||||
|
||||
/**************************************************************************
|
||||
* That concludes the basic part of this demonstration program. We have *
|
||||
* compiled a pattern, and performed a single match. The code that follows *
|
||||
* first shows how to access named substrings, and then how to code for *
|
||||
* repeated matches on the same subject. *
|
||||
**************************************************************************/
|
||||
|
||||
/* See if there are any named substrings, and if so, show them by name. First
|
||||
we have to extract the count of named parentheses from the pattern. */
|
||||
|
||||
(void)pcre_fullinfo(
|
||||
re, /* the compiled pattern */
|
||||
NULL, /* no extra data - we didn't study the pattern */
|
||||
PCRE_INFO_NAMECOUNT, /* number of named substrings */
|
||||
&namecount); /* where to put the answer */
|
||||
|
||||
if (namecount <= 0) printf("No named substrings\n"); else
|
||||
{
|
||||
unsigned char *tabptr;
|
||||
printf("Named substrings\n");
|
||||
|
||||
/* Before we can access the substrings, we must extract the table for
|
||||
translating names to numbers, and the size of each entry in the table. */
|
||||
|
||||
(void)pcre_fullinfo(
|
||||
re, /* the compiled pattern */
|
||||
NULL, /* no extra data - we didn't study the pattern */
|
||||
PCRE_INFO_NAMETABLE, /* address of the table */
|
||||
&name_table); /* where to put the answer */
|
||||
|
||||
(void)pcre_fullinfo(
|
||||
re, /* the compiled pattern */
|
||||
NULL, /* no extra data - we didn't study the pattern */
|
||||
PCRE_INFO_NAMEENTRYSIZE, /* size of each entry in the table */
|
||||
&name_entry_size); /* where to put the answer */
|
||||
|
||||
/* Now we can scan the table and, for each entry, print the number, the name,
|
||||
and the substring itself. */
|
||||
|
||||
tabptr = name_table;
|
||||
for (i = 0; i < namecount; i++)
|
||||
{
|
||||
int n = (tabptr[0] << 8) | tabptr[1];
|
||||
printf("(%d) %*s: %.*s\n", n, name_entry_size - 3, tabptr + 2,
|
||||
ovector[2*n+1] - ovector[2*n], subject + ovector[2*n]);
|
||||
tabptr += name_entry_size;
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
/*************************************************************************
|
||||
* If the "-g" option was given on the command line, we want to continue *
|
||||
* to search for additional matches in the subject string, in a similar *
|
||||
* way to the /g option in Perl. This turns out to be trickier than you *
|
||||
* might think because of the possibility of matching an empty string. *
|
||||
* What happens is as follows: *
|
||||
* *
|
||||
* If the previous match was NOT for an empty string, we can just start *
|
||||
* the next match at the end of the previous one. *
|
||||
* *
|
||||
* If the previous match WAS for an empty string, we can't do that, as it *
|
||||
* would lead to an infinite loop. Instead, a special call of pcre_exec() *
|
||||
* is made with the PCRE_NOTEMPTY and PCRE_ANCHORED flags set. The first *
|
||||
* of these tells PCRE that an empty string is not a valid match; other *
|
||||
* possibilities must be tried. The second flag restricts PCRE to one *
|
||||
* match attempt at the initial string position. If this match succeeds, *
|
||||
* an alternative to the empty string match has been found, and we can *
|
||||
* proceed round the loop. *
|
||||
*************************************************************************/
|
||||
|
||||
if (!find_all)
|
||||
{
|
||||
free(re); /* Release the memory used for the compiled pattern */
|
||||
return 0; /* Finish unless -g was given */
|
||||
}
|
||||
|
||||
/* Loop for second and subsequent matches */
|
||||
|
||||
for (;;)
|
||||
{
|
||||
int options = 0; /* Normally no options */
|
||||
int start_offset = ovector[1]; /* Start at end of previous match */
|
||||
|
||||
/* If the previous match was for an empty string, we are finished if we are
|
||||
at the end of the subject. Otherwise, arrange to run another match at the
|
||||
same point to see if a non-empty match can be found. */
|
||||
|
||||
if (ovector[0] == ovector[1])
|
||||
{
|
||||
if (ovector[0] == subject_length) break;
|
||||
options = PCRE_NOTEMPTY | PCRE_ANCHORED;
|
||||
}
|
||||
|
||||
/* Run the next matching operation */
|
||||
|
||||
rc = pcre_exec(
|
||||
re, /* the compiled pattern */
|
||||
NULL, /* no extra data - we didn't study the pattern */
|
||||
subject, /* the subject string */
|
||||
subject_length, /* the length of the subject */
|
||||
start_offset, /* starting offset in the subject */
|
||||
options, /* options */
|
||||
ovector, /* output vector for substring information */
|
||||
OVECCOUNT); /* number of elements in the output vector */
|
||||
|
||||
/* This time, a result of NOMATCH isn't an error. If the value in "options"
|
||||
is zero, it just means we have found all possible matches, so the loop ends.
|
||||
Otherwise, it means we have failed to find a non-empty-string match at a
|
||||
point where there was a previous empty-string match. In this case, we do what
|
||||
Perl does: advance the matching position by one, and continue. We do this by
|
||||
setting the "end of previous match" offset, because that is picked up at the
|
||||
top of the loop as the point at which to start again. */
|
||||
|
||||
if (rc == PCRE_ERROR_NOMATCH)
|
||||
{
|
||||
if (options == 0) break;
|
||||
ovector[1] = start_offset + 1;
|
||||
continue; /* Go round the loop again */
|
||||
}
|
||||
|
||||
/* Other matching errors are not recoverable. */
|
||||
|
||||
if (rc < 0)
|
||||
{
|
||||
printf("Matching error %d\n", rc);
|
||||
free(re); /* Release memory used for the compiled pattern */
|
||||
return 1;
|
||||
}
|
||||
|
||||
/* Match succeded */
|
||||
|
||||
printf("\nMatch succeeded again at offset %d\n", ovector[0]);
|
||||
|
||||
/* The match succeeded, but the output vector wasn't big enough. */
|
||||
|
||||
if (rc == 0)
|
||||
{
|
||||
rc = OVECCOUNT/3;
|
||||
printf("ovector only has room for %d captured substrings\n", rc - 1);
|
||||
}
|
||||
|
||||
/* As before, show substrings stored in the output vector by number, and then
|
||||
also any named substrings. */
|
||||
|
||||
for (i = 0; i < rc; i++)
|
||||
{
|
||||
char *substring_start = subject + ovector[2*i];
|
||||
int substring_length = ovector[2*i+1] - ovector[2*i];
|
||||
printf("%2d: %.*s\n", i, substring_length, substring_start);
|
||||
}
|
||||
|
||||
if (namecount <= 0) printf("No named substrings\n"); else
|
||||
{
|
||||
unsigned char *tabptr = name_table;
|
||||
printf("Named substrings\n");
|
||||
for (i = 0; i < namecount; i++)
|
||||
{
|
||||
int n = (tabptr[0] << 8) | tabptr[1];
|
||||
printf("(%d) %*s: %.*s\n", n, name_entry_size - 3, tabptr + 2,
|
||||
ovector[2*n+1] - ovector[2*n], subject + ovector[2*n]);
|
||||
tabptr += name_entry_size;
|
||||
}
|
||||
}
|
||||
} /* End of loop to find second and subsequent matches */
|
||||
|
||||
printf("\n");
|
||||
free(re); /* Release memory used for the compiled pattern */
|
||||
return 0;
|
||||
}
|
||||
|
||||
/* End of pcredemo.c */
|
||||
@@ -4,7 +4,38 @@
|
||||
|
||||
/* This is a grep program that uses the PCRE regular expression library to do
|
||||
its pattern matching. On a Unix or Win32 system it can recurse into
|
||||
directories. */
|
||||
directories.
|
||||
|
||||
Copyright (c) 1997-2004 University of Cambridge
|
||||
|
||||
-----------------------------------------------------------------------------
|
||||
Redistribution and use in source and binary forms, with or without
|
||||
modification, are permitted provided that the following conditions are met:
|
||||
|
||||
* Redistributions of source code must retain the above copyright notice,
|
||||
this list of conditions and the following disclaimer.
|
||||
|
||||
* Redistributions in binary form must reproduce the above copyright
|
||||
notice, this list of conditions and the following disclaimer in the
|
||||
documentation and/or other materials provided with the distribution.
|
||||
|
||||
* Neither the name of the University of Cambridge nor the names of its
|
||||
contributors may be used to endorse or promote products derived from
|
||||
this software without specific prior written permission.
|
||||
|
||||
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
|
||||
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
|
||||
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
|
||||
ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
|
||||
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
|
||||
CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
|
||||
SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
|
||||
INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
|
||||
CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
|
||||
ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
|
||||
POSSIBILITY OF SUCH DAMAGE.
|
||||
-----------------------------------------------------------------------------
|
||||
*/
|
||||
|
||||
#include <ctype.h>
|
||||
#include <stdio.h>
|
||||
|
||||
@@ -15,23 +15,31 @@ Written by: Philip Hazel <ph10@cam.ac.uk>
|
||||
Copyright (c) 1997-2004 University of Cambridge
|
||||
|
||||
-----------------------------------------------------------------------------
|
||||
Permission is granted to anyone to use this software for any purpose on any
|
||||
computer system, and to redistribute it freely, subject to the following
|
||||
restrictions:
|
||||
Redistribution and use in source and binary forms, with or without
|
||||
modification, are permitted provided that the following conditions are met:
|
||||
|
||||
1. This software is distributed in the hope that it will be useful,
|
||||
but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
|
||||
* Redistributions of source code must retain the above copyright notice,
|
||||
this list of conditions and the following disclaimer.
|
||||
|
||||
2. The origin of this software must not be misrepresented, either by
|
||||
explicit claim or by omission.
|
||||
* Redistributions in binary form must reproduce the above copyright
|
||||
notice, this list of conditions and the following disclaimer in the
|
||||
documentation and/or other materials provided with the distribution.
|
||||
|
||||
3. Altered versions must be plainly marked as such, and must not be
|
||||
misrepresented as being the original software.
|
||||
* Neither the name of the University of Cambridge nor the names of its
|
||||
contributors may be used to endorse or promote products derived from
|
||||
this software without specific prior written permission.
|
||||
|
||||
4. If PCRE is embedded in any software that is released under the GNU
|
||||
General Purpose Licence (GPL), then the terms of that licence shall
|
||||
supersede any condition above with which it is incompatible.
|
||||
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
|
||||
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
|
||||
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
|
||||
ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
|
||||
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
|
||||
CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
|
||||
SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
|
||||
INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
|
||||
CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
|
||||
ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
|
||||
POSSIBILITY OF SUCH DAMAGE.
|
||||
-----------------------------------------------------------------------------
|
||||
*/
|
||||
|
||||
@@ -48,7 +56,7 @@ static const char *const estring[] = {
|
||||
ERR11, ERR12, ERR13, ERR14, ERR15, ERR16, ERR17, ERR18, ERR19, ERR20,
|
||||
ERR21, ERR22, ERR23, ERR24, ERR25, ERR26, ERR27, ERR29, ERR29, ERR30,
|
||||
ERR31, ERR32, ERR33, ERR34, ERR35, ERR36, ERR37, ERR38, ERR39, ERR40,
|
||||
ERR41, ERR42, ERR43, ERR44 };
|
||||
ERR41, ERR42, ERR43, ERR44, ERR45, ERR46, ERR47 };
|
||||
|
||||
static const int eint[] = {
|
||||
REG_EESCAPE, /* "\\ at end of pattern" */
|
||||
@@ -87,14 +95,17 @@ static const int eint[] = {
|
||||
REG_BADPAT, /* "character value in \x{...} sequence is too large" */
|
||||
REG_BADPAT, /* "invalid condition (?(0)" */
|
||||
REG_BADPAT, /* "\\C not allowed in lookbehind assertion" */
|
||||
REG_EESCAPE, /* "PCRE does not support \\L, \\l, \\N, \\P, \\p, \\U, \\u, or \\X" */
|
||||
REG_EESCAPE, /* "PCRE does not support \\L, \\l, \\N, \\U, or \\u" */
|
||||
REG_BADPAT, /* "number after (?C is > 255" */
|
||||
REG_BADPAT, /* "closing ) for (?C expected" */
|
||||
REG_BADPAT, /* "recursive call could loop indefinitely" */
|
||||
REG_BADPAT, /* "unrecognized character after (?P" */
|
||||
REG_BADPAT, /* "syntax error after (?P" */
|
||||
REG_BADPAT, /* "two named groups have the same name" */
|
||||
REG_BADPAT /* "invalid UTF-8 string" */
|
||||
REG_BADPAT, /* "invalid UTF-8 string" */
|
||||
REG_BADPAT, /* "support for \\P, \\p, and \\X has not been compiled" */
|
||||
REG_BADPAT, /* "malformed \\P or \\p sequence" */
|
||||
REG_BADPAT /* "unknown property name after \\P or \\p" */
|
||||
};
|
||||
|
||||
/* Table of texts corresponding to POSIX error codes */
|
||||
|
||||
@@ -2,14 +2,43 @@
|
||||
* Perl-Compatible Regular Expressions *
|
||||
*************************************************/
|
||||
|
||||
/* Copyright (c) 1997-2003 University of Cambridge */
|
||||
|
||||
#ifndef _PCREPOSIX_H
|
||||
#define _PCREPOSIX_H
|
||||
|
||||
/* This is the header for the POSIX wrapper interface to the PCRE Perl-
|
||||
Compatible Regular Expression library. It defines the things POSIX says should
|
||||
be there. I hope. */
|
||||
be there. I hope.
|
||||
|
||||
Copyright (c) 1997-2004 University of Cambridge
|
||||
|
||||
-----------------------------------------------------------------------------
|
||||
Redistribution and use in source and binary forms, with or without
|
||||
modification, are permitted provided that the following conditions are met:
|
||||
|
||||
* Redistributions of source code must retain the above copyright notice,
|
||||
this list of conditions and the following disclaimer.
|
||||
|
||||
* Redistributions in binary form must reproduce the above copyright
|
||||
notice, this list of conditions and the following disclaimer in the
|
||||
documentation and/or other materials provided with the distribution.
|
||||
|
||||
* Neither the name of the University of Cambridge nor the names of its
|
||||
contributors may be used to endorse or promote products derived from
|
||||
this software without specific prior written permission.
|
||||
|
||||
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
|
||||
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
|
||||
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
|
||||
ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
|
||||
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
|
||||
CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
|
||||
SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
|
||||
INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
|
||||
CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
|
||||
ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
|
||||
POSSIBILITY OF SUCH DAMAGE.
|
||||
-----------------------------------------------------------------------------
|
||||
*/
|
||||
|
||||
/* Have to include stdlib.h in order to ensure that size_t is defined. */
|
||||
|
||||
|
||||
+380
-77
@@ -4,7 +4,37 @@
|
||||
|
||||
/* This program was hacked up as a tester for PCRE. I really should have
|
||||
written it more tidily in the first place. Will I ever learn? It has grown and
|
||||
been extended and consequently is now rather untidy in places. */
|
||||
been extended and consequently is now rather untidy in places.
|
||||
|
||||
-----------------------------------------------------------------------------
|
||||
Redistribution and use in source and binary forms, with or without
|
||||
modification, are permitted provided that the following conditions are met:
|
||||
|
||||
* Redistributions of source code must retain the above copyright notice,
|
||||
this list of conditions and the following disclaimer.
|
||||
|
||||
* Redistributions in binary form must reproduce the above copyright
|
||||
notice, this list of conditions and the following disclaimer in the
|
||||
documentation and/or other materials provided with the distribution.
|
||||
|
||||
* Neither the name of the University of Cambridge nor the names of its
|
||||
contributors may be used to endorse or promote products derived from
|
||||
this software without specific prior written permission.
|
||||
|
||||
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
|
||||
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
|
||||
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
|
||||
ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
|
||||
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
|
||||
CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
|
||||
SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
|
||||
INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
|
||||
CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
|
||||
ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
|
||||
POSSIBILITY OF SUCH DAMAGE.
|
||||
-----------------------------------------------------------------------------
|
||||
*/
|
||||
|
||||
|
||||
#include <ctype.h>
|
||||
#include <stdio.h>
|
||||
@@ -12,6 +42,7 @@ been extended and consequently is now rather untidy in places. */
|
||||
#include <stdlib.h>
|
||||
#include <time.h>
|
||||
#include <locale.h>
|
||||
#include <errno.h>
|
||||
|
||||
/* We need the internal info for displaying the results of pcre_study(). Also
|
||||
for getting the opcodes for showing compiled code. */
|
||||
@@ -35,9 +66,10 @@ Makefile. */
|
||||
#endif
|
||||
#endif
|
||||
|
||||
#define LOOPREPEAT 50000
|
||||
#define LOOPREPEAT 500000
|
||||
|
||||
#define BUFFER_SIZE 30000
|
||||
#define PBUFFER_SIZE BUFFER_SIZE
|
||||
#define DBUFFER_SIZE BUFFER_SIZE
|
||||
|
||||
|
||||
@@ -52,6 +84,8 @@ static int show_malloc;
|
||||
static int use_utf8;
|
||||
static size_t gotten_store;
|
||||
|
||||
static uschar *pbuffer = NULL;
|
||||
|
||||
|
||||
static const int utf8_table1[] = {
|
||||
0x0000007f, 0x000007ff, 0x0000ffff, 0x001fffff, 0x03ffffff, 0x7fffffff};
|
||||
@@ -71,10 +105,13 @@ static const int utf8_table3[] = {
|
||||
/* The code for doing this is held in a separate file that is also included in
|
||||
pcre.c when it is compiled with the debug switch. It defines a function called
|
||||
print_internals(), which uses a table of opcode lengths defined by the macro
|
||||
OP_LENGTHS, whose name must be OP_lengths. */
|
||||
OP_LENGTHS, whose name must be OP_lengths. It also uses a table that translates
|
||||
Unicode property names to numbers; this is kept in a separate file. */
|
||||
|
||||
static uschar OP_lengths[] = { OP_LENGTHS };
|
||||
|
||||
#include "ucp.h"
|
||||
#include "ucptypetable.c"
|
||||
#include "printint.c"
|
||||
|
||||
|
||||
@@ -269,7 +306,7 @@ data is not zero. */
|
||||
static int callout(pcre_callout_block *cb)
|
||||
{
|
||||
FILE *f = (first_callout | callout_extra)? outfile : NULL;
|
||||
int i, pre_start, post_start;
|
||||
int i, pre_start, post_start, subject_length;
|
||||
|
||||
if (callout_extra)
|
||||
{
|
||||
@@ -300,16 +337,26 @@ pre_start = pchars((unsigned char *)cb->subject, cb->start_match, f);
|
||||
post_start = pchars((unsigned char *)(cb->subject + cb->start_match),
|
||||
cb->current_position - cb->start_match, f);
|
||||
|
||||
subject_length = pchars((unsigned char *)cb->subject, cb->subject_length, NULL);
|
||||
|
||||
(void)pchars((unsigned char *)(cb->subject + cb->current_position),
|
||||
cb->subject_length - cb->current_position, f);
|
||||
|
||||
if (f != NULL) fprintf(f, "\n");
|
||||
|
||||
/* Always print appropriate indicators, with callout number if not already
|
||||
shown */
|
||||
shown. For automatic callouts, show the pattern offset. */
|
||||
|
||||
if (callout_extra) fprintf(outfile, " ");
|
||||
else fprintf(outfile, "%3d ", cb->callout_number);
|
||||
if (cb->callout_number == 255)
|
||||
{
|
||||
fprintf(outfile, "%+3d ", cb->pattern_position);
|
||||
if (cb->pattern_position > 99) fprintf(outfile, "\n ");
|
||||
}
|
||||
else
|
||||
{
|
||||
if (callout_extra) fprintf(outfile, " ");
|
||||
else fprintf(outfile, "%3d ", cb->callout_number);
|
||||
}
|
||||
|
||||
for (i = 0; i < pre_start; i++) fprintf(outfile, " ");
|
||||
fprintf(outfile, "^");
|
||||
@@ -320,6 +367,12 @@ if (post_start > 0)
|
||||
fprintf(outfile, "^");
|
||||
}
|
||||
|
||||
for (i = 0; i < subject_length - pre_start - post_start + 4; i++)
|
||||
fprintf(outfile, " ");
|
||||
|
||||
fprintf(outfile, "%.*s", (cb->next_item_length == 0)? 1 : cb->next_item_length,
|
||||
pbuffer + cb->pattern_position);
|
||||
|
||||
fprintf(outfile, "\n");
|
||||
first_callout = 0;
|
||||
|
||||
@@ -395,6 +448,23 @@ if ((rc = pcre_fullinfo(re, study, option, ptr)) < 0)
|
||||
|
||||
|
||||
|
||||
/*************************************************
|
||||
* Byte flipping function *
|
||||
*************************************************/
|
||||
|
||||
/* Reverse the byte order of a value. When n is 2 the two low-order bytes are
swapped; for any other n the four low-order bytes are reversed. Bits above
those handled are discarded. The work is done in unsigned arithmetic because
shifting a byte into the sign bit of a signed long is undefined where long is
32 bits wide. */

static long int
byteflip(long int value, int n)
{
unsigned long int v = (unsigned long int)value;
if (n == 2)
  return (long int)(((v & 0x00ffUL) << 8) | ((v & 0xff00UL) >> 8));
return (long int)(((v & 0x000000ffUL) << 24) |
                  ((v & 0x0000ff00UL) <<  8) |
                  ((v & 0x00ff0000UL) >>  8) |
                  ((v & 0xff000000UL) >> 24));
}
|
||||
|
||||
|
||||
|
||||
|
||||
/*************************************************
|
||||
* Main Program *
|
||||
*************************************************/
|
||||
@@ -429,8 +499,15 @@ when I am debugging. */
|
||||
|
||||
buffer = (unsigned char *)malloc(BUFFER_SIZE);
|
||||
dbuffer = (unsigned char *)malloc(DBUFFER_SIZE);
|
||||
pbuffer = (unsigned char *)malloc(PBUFFER_SIZE);
|
||||
|
||||
/* Static so that new_malloc can use it. */
|
||||
/* The outfile variable is static so that new_malloc can use it. The _setmode()
|
||||
stuff is some magic that I don't understand, but which apparently does good
|
||||
things in Windows. It's related to line terminations. */
|
||||
|
||||
#if defined(_WIN32) || defined(WIN32)
|
||||
_setmode( _fileno( stdout ), 0x8000 );
|
||||
#endif /* defined(_WIN32) || defined(WIN32) */
|
||||
|
||||
outfile = stdout;
|
||||
|
||||
@@ -462,6 +539,8 @@ while (argc > 1 && argv[op][0] == '-')
|
||||
printf("Compiled with\n");
|
||||
(void)pcre_config(PCRE_CONFIG_UTF8, &rc);
|
||||
printf(" %sUTF-8 support\n", rc? "" : "No ");
|
||||
(void)pcre_config(PCRE_CONFIG_UNICODE_PROPERTIES, &rc);
|
||||
printf(" %sUnicode properties support\n", rc? "" : "No ");
|
||||
(void)pcre_config(PCRE_CONFIG_NEWLINE, &rc);
|
||||
printf(" Newline character is %s\n", (rc == '\r')? "CR" : "LF");
|
||||
(void)pcre_config(PCRE_CONFIG_LINK_SIZE, &rc);
|
||||
@@ -481,11 +560,12 @@ while (argc > 1 && argv[op][0] == '-')
|
||||
printf(" -C show PCRE compile-time options and exit\n");
|
||||
printf(" -d debug: show compiled code; implies -i\n"
|
||||
" -i show information about compiled pattern\n"
|
||||
" -m output memory used information\n"
|
||||
" -o <n> set size of offsets vector to <n>\n");
|
||||
#if !defined NOPOSIX
|
||||
printf(" -p use POSIX interface\n");
|
||||
#endif
|
||||
printf(" -s output store information\n"
|
||||
printf(" -s output store (memory) used information\n"
|
||||
" -t time compilation and execution\n");
|
||||
return 1;
|
||||
}
|
||||
@@ -508,7 +588,7 @@ if (offsets == NULL)
|
||||
|
||||
if (argc > 1)
|
||||
{
|
||||
infile = fopen(argv[op], "r");
|
||||
infile = fopen(argv[op], "rb");
|
||||
if (infile == NULL)
|
||||
{
|
||||
printf("** Failed to open %s\n", argv[op]);
|
||||
@@ -518,7 +598,7 @@ if (argc > 1)
|
||||
|
||||
if (argc > 2)
|
||||
{
|
||||
outfile = fopen(argv[op+1], "w");
|
||||
outfile = fopen(argv[op+1], "wb");
|
||||
if (outfile == NULL)
|
||||
{
|
||||
printf("** Failed to open %s\n", argv[op+1]);
|
||||
@@ -551,13 +631,17 @@ while (!done)
|
||||
|
||||
const char *error;
|
||||
unsigned char *p, *pp, *ppp;
|
||||
unsigned char *to_file = NULL;
|
||||
const unsigned char *tables = NULL;
|
||||
unsigned long int true_size, true_study_size = 0;
|
||||
size_t size, regex_gotten_store;
|
||||
int do_study = 0;
|
||||
int do_debug = debug;
|
||||
int do_G = 0;
|
||||
int do_g = 0;
|
||||
int do_showinfo = showinfo;
|
||||
int do_showrest = 0;
|
||||
int do_flip = 0;
|
||||
int erroroffset, len, delimiter;
|
||||
|
||||
use_utf8 = 0;
|
||||
@@ -571,8 +655,93 @@ while (!done)
|
||||
while (isspace(*p)) p++;
|
||||
if (*p == 0) continue;
|
||||
|
||||
/* Get the delimiter and seek the end of the pattern; if is isn't
|
||||
complete, read more. */
|
||||
/* See if the pattern is to be loaded pre-compiled from a file. */
|
||||
|
||||
if (*p == '<' && strchr((char *)(p+1), '<') == NULL)
|
||||
{
|
||||
unsigned long int magic;
|
||||
uschar sbuf[8];
|
||||
FILE *f;
|
||||
|
||||
p++;
|
||||
pp = p + (int)strlen((char *)p);
|
||||
while (isspace(pp[-1])) pp--;
|
||||
*pp = 0;
|
||||
|
||||
f = fopen((char *)p, "rb");
|
||||
if (f == NULL)
|
||||
{
|
||||
fprintf(outfile, "Failed to open %s: %s\n", p, strerror(errno));
|
||||
continue;
|
||||
}
|
||||
|
||||
if (fread(sbuf, 1, 8, f) != 8) goto FAIL_READ;
|
||||
|
||||
true_size =
|
||||
(sbuf[0] << 24) | (sbuf[1] << 16) | (sbuf[2] << 8) | sbuf[3];
|
||||
true_study_size =
|
||||
(sbuf[4] << 24) | (sbuf[5] << 16) | (sbuf[6] << 8) | sbuf[7];
|
||||
|
||||
re = (real_pcre *)new_malloc(true_size);
|
||||
regex_gotten_store = gotten_store;
|
||||
|
||||
if (fread(re, 1, true_size, f) != true_size) goto FAIL_READ;
|
||||
|
||||
magic = ((real_pcre *)re)->magic_number;
|
||||
if (magic != MAGIC_NUMBER)
|
||||
{
|
||||
if (byteflip(magic, sizeof(magic)) == MAGIC_NUMBER)
|
||||
{
|
||||
do_flip = 1;
|
||||
}
|
||||
else
|
||||
{
|
||||
fprintf(outfile, "Data in %s is not a compiled PCRE regex\n", p);
|
||||
fclose(f);
|
||||
continue;
|
||||
}
|
||||
}
|
||||
|
||||
fprintf(outfile, "Compiled regex%s loaded from %s\n",
|
||||
do_flip? " (byte-inverted)" : "", p);
|
||||
|
||||
/* Need to know if UTF-8 for printing data strings */
|
||||
|
||||
new_info(re, NULL, PCRE_INFO_OPTIONS, &options);
|
||||
use_utf8 = (options & PCRE_UTF8) != 0;
|
||||
|
||||
/* Now see if there is any following study data */
|
||||
|
||||
if (true_study_size != 0)
|
||||
{
|
||||
pcre_study_data *psd;
|
||||
|
||||
extra = (pcre_extra *)new_malloc(sizeof(pcre_extra) + true_study_size);
|
||||
extra->flags = PCRE_EXTRA_STUDY_DATA;
|
||||
|
||||
psd = (pcre_study_data *)(((char *)extra) + sizeof(pcre_extra));
|
||||
extra->study_data = psd;
|
||||
|
||||
if (fread(psd, 1, true_study_size, f) != true_study_size)
|
||||
{
|
||||
FAIL_READ:
|
||||
fprintf(outfile, "Failed to read data from %s\n", p);
|
||||
if (extra != NULL) new_free(extra);
|
||||
if (re != NULL) new_free(re);
|
||||
fclose(f);
|
||||
continue;
|
||||
}
|
||||
fprintf(outfile, "Study data loaded from %s\n", p);
|
||||
do_study = 1; /* To get the data output if requested */
|
||||
}
|
||||
else fprintf(outfile, "No study data\n");
|
||||
|
||||
fclose(f);
|
||||
goto SHOW_INFO;
|
||||
}
|
||||
|
||||
/* In-line pattern (the usual case). Get the delimiter and seek the end of
|
||||
the pattern; if it isn't complete, read more. */
|
||||
|
||||
delimiter = *p++;
|
||||
|
||||
@@ -617,9 +786,11 @@ while (!done)
|
||||
|
||||
if (pp[1] == '\\') *pp++ = '\\';
|
||||
|
||||
/* Terminate the pattern at the delimiter */
|
||||
/* Terminate the pattern at the delimiter, and save a copy of the pattern
|
||||
for callouts. */
|
||||
|
||||
*pp++ = 0;
|
||||
strcpy((char *)pbuffer, (char *)p);
|
||||
|
||||
/* Look for options after final delimiter */
|
||||
|
||||
@@ -639,8 +810,10 @@ while (!done)
|
||||
|
||||
case '+': do_showrest = 1; break;
|
||||
case 'A': options |= PCRE_ANCHORED; break;
|
||||
case 'C': options |= PCRE_AUTO_CALLOUT; break;
|
||||
case 'D': do_debug = do_showinfo = 1; break;
|
||||
case 'E': options |= PCRE_DOLLAR_ENDONLY; break;
|
||||
case 'F': do_flip = 1; break;
|
||||
case 'G': do_G = 1; break;
|
||||
case 'I': do_showinfo = 1; break;
|
||||
case 'M': log_store = 1; break;
|
||||
@@ -669,7 +842,15 @@ while (!done)
|
||||
pp = ppp;
|
||||
break;
|
||||
|
||||
case '>':
|
||||
to_file = pp;
|
||||
while (*pp != 0) pp++;
|
||||
while (isspace(pp[-1])) pp--;
|
||||
*pp = 0;
|
||||
break;
|
||||
|
||||
case '\n': case ' ': break;
|
||||
|
||||
default:
|
||||
fprintf(outfile, "** Unknown option '%c'\n", pp[-1]);
|
||||
goto SKIP_DATA;
|
||||
@@ -685,6 +866,7 @@ while (!done)
|
||||
{
|
||||
int rc;
|
||||
int cflags = 0;
|
||||
|
||||
if ((options & PCRE_CASELESS) != 0) cflags |= REG_ICASE;
|
||||
if ((options & PCRE_MULTILINE) != 0) cflags |= REG_NEWLINE;
|
||||
rc = regcomp(&preg, (char *)p, cflags);
|
||||
@@ -759,14 +941,77 @@ while (!done)
|
||||
sizeof(real_pcre) -
|
||||
((real_pcre *)re)->name_count * ((real_pcre *)re)->name_entry_size));
|
||||
|
||||
/* Extract the size for possible writing before possibly flipping it,
|
||||
and remember the store that was got. */
|
||||
|
||||
true_size = ((real_pcre *)re)->size;
|
||||
regex_gotten_store = gotten_store;
|
||||
|
||||
/* If /S was present, study the regexp to generate additional info to
|
||||
help with the matching. */
|
||||
|
||||
if (do_study)
|
||||
{
|
||||
if (timeit)
|
||||
{
|
||||
register int i;
|
||||
clock_t time_taken;
|
||||
clock_t start_time = clock();
|
||||
for (i = 0; i < LOOPREPEAT; i++)
|
||||
extra = pcre_study(re, study_options, &error);
|
||||
time_taken = clock() - start_time;
|
||||
if (extra != NULL) free(extra);
|
||||
fprintf(outfile, " Study time %.3f milliseconds\n",
|
||||
(((double)time_taken * 1000.0) / (double)LOOPREPEAT) /
|
||||
(double)CLOCKS_PER_SEC);
|
||||
}
|
||||
extra = pcre_study(re, study_options, &error);
|
||||
if (error != NULL)
|
||||
fprintf(outfile, "Failed to study: %s\n", error);
|
||||
else if (extra != NULL)
|
||||
true_study_size = ((pcre_study_data *)(extra->study_data))->size;
|
||||
}
|
||||
|
||||
/* If the 'F' option was present, we flip the bytes of all the integer
|
||||
fields in the regex data block and the study block. This is to make it
|
||||
possible to test PCRE's handling of byte-flipped patterns, e.g. those
|
||||
compiled on a different architecture. */
|
||||
|
||||
if (do_flip)
|
||||
{
|
||||
real_pcre *rre = (real_pcre *)re;
|
||||
rre->magic_number = byteflip(rre->magic_number, sizeof(rre->magic_number));
|
||||
rre->size = byteflip(rre->size, sizeof(rre->size));
|
||||
rre->options = byteflip(rre->options, sizeof(rre->options));
|
||||
rre->top_bracket = byteflip(rre->top_bracket, sizeof(rre->top_bracket));
|
||||
rre->top_backref = byteflip(rre->top_backref, sizeof(rre->top_backref));
|
||||
rre->first_byte = byteflip(rre->first_byte, sizeof(rre->first_byte));
|
||||
rre->req_byte = byteflip(rre->req_byte, sizeof(rre->req_byte));
|
||||
rre->name_table_offset = byteflip(rre->name_table_offset,
|
||||
sizeof(rre->name_table_offset));
|
||||
rre->name_entry_size = byteflip(rre->name_entry_size,
|
||||
sizeof(rre->name_entry_size));
|
||||
rre->name_count = byteflip(rre->name_count, sizeof(rre->name_count));
|
||||
|
||||
if (extra != NULL)
|
||||
{
|
||||
pcre_study_data *rsd = (pcre_study_data *)(extra->study_data);
|
||||
rsd->size = byteflip(rsd->size, sizeof(rsd->size));
|
||||
rsd->options = byteflip(rsd->options, sizeof(rsd->options));
|
||||
}
|
||||
}
|
||||
|
||||
/* Extract information from the compiled data if required */
|
||||
|
||||
SHOW_INFO:
|
||||
|
||||
if (do_showinfo)
|
||||
{
|
||||
unsigned long int get_options;
|
||||
unsigned long int get_options, all_options;
|
||||
int old_first_char, old_options, old_count;
|
||||
int count, backrefmax, first_char, need_char;
|
||||
int nameentrysize, namecount;
|
||||
const uschar *nametable;
|
||||
size_t size;
|
||||
|
||||
if (do_debug)
|
||||
{
|
||||
@@ -802,9 +1047,9 @@ while (!done)
|
||||
get_options, old_options);
|
||||
}
|
||||
|
||||
if (size != gotten_store) fprintf(outfile,
|
||||
if (size != regex_gotten_store) fprintf(outfile,
|
||||
"Size disagreement: pcre_fullinfo=%d call to malloc for %d\n",
|
||||
size, gotten_store);
|
||||
size, regex_gotten_store);
|
||||
|
||||
fprintf(outfile, "Capturing subpattern count = %d\n", count);
|
||||
if (backrefmax > 0)
|
||||
@@ -822,6 +1067,18 @@ while (!done)
|
||||
}
|
||||
}
|
||||
|
||||
/* The NOPARTIAL bit is a private bit in the options, so we have
|
||||
to fish it out via our back door */
|
||||
|
||||
all_options = ((real_pcre *)re)->options;
|
||||
if (do_flip)
|
||||
{
|
||||
all_options = byteflip(all_options, sizeof(all_options));
|
||||
}
|
||||
|
||||
if ((all_options & PCRE_NOPARTIAL) != 0)
|
||||
fprintf(outfile, "Partial matching not supported\n");
|
||||
|
||||
if (get_options == 0) fprintf(outfile, "No options\n");
|
||||
else fprintf(outfile, "Options:%s%s%s%s%s%s%s%s%s%s\n",
|
||||
((get_options & PCRE_ANCHORED) != 0)? " anchored" : "",
|
||||
@@ -871,77 +1128,103 @@ while (!done)
|
||||
else
|
||||
fprintf(outfile, "Need char = %d%s\n", ch, caseless);
|
||||
}
|
||||
}
|
||||
|
||||
/* If /S was present, study the regexp to generate additional info to
|
||||
help with the matching. */
|
||||
|
||||
if (do_study)
|
||||
{
|
||||
if (timeit)
|
||||
{
|
||||
register int i;
|
||||
clock_t time_taken;
|
||||
clock_t start_time = clock();
|
||||
for (i = 0; i < LOOPREPEAT; i++)
|
||||
extra = pcre_study(re, study_options, &error);
|
||||
time_taken = clock() - start_time;
|
||||
if (extra != NULL) free(extra);
|
||||
fprintf(outfile, " Study time %.3f milliseconds\n",
|
||||
(((double)time_taken * 1000.0) / (double)LOOPREPEAT) /
|
||||
(double)CLOCKS_PER_SEC);
|
||||
}
|
||||
|
||||
extra = pcre_study(re, study_options, &error);
|
||||
if (error != NULL)
|
||||
fprintf(outfile, "Failed to study: %s\n", error);
|
||||
else if (extra == NULL)
|
||||
fprintf(outfile, "Study returned NULL\n");
|
||||
|
||||
/* Don't output study size; at present it is in any case a fixed
|
||||
value, but it varies, depending on the computer architecture, and
|
||||
so messes up the test suite. */
|
||||
so messes up the test suite. (And with the /F option, it might be
|
||||
flipped.) */
|
||||
|
||||
else if (do_showinfo)
|
||||
if (do_study)
|
||||
{
|
||||
size_t size;
|
||||
uschar *start_bits = NULL;
|
||||
new_info(re, extra, PCRE_INFO_STUDYSIZE, &size);
|
||||
new_info(re, extra, PCRE_INFO_FIRSTTABLE, &start_bits);
|
||||
/* fprintf(outfile, "Study size = %d\n", size); */
|
||||
if (start_bits == NULL)
|
||||
fprintf(outfile, "No starting character set\n");
|
||||
if (extra == NULL)
|
||||
fprintf(outfile, "Study returned NULL\n");
|
||||
else
|
||||
{
|
||||
int i;
|
||||
int c = 24;
|
||||
fprintf(outfile, "Starting character set: ");
|
||||
for (i = 0; i < 256; i++)
|
||||
uschar *start_bits = NULL;
|
||||
new_info(re, extra, PCRE_INFO_FIRSTTABLE, &start_bits);
|
||||
|
||||
if (start_bits == NULL)
|
||||
fprintf(outfile, "No starting byte set\n");
|
||||
else
|
||||
{
|
||||
if ((start_bits[i/8] & (1<<(i%8))) != 0)
|
||||
int i;
|
||||
int c = 24;
|
||||
fprintf(outfile, "Starting byte set: ");
|
||||
for (i = 0; i < 256; i++)
|
||||
{
|
||||
if (c > 75)
|
||||
if ((start_bits[i/8] & (1<<(i&7))) != 0)
|
||||
{
|
||||
fprintf(outfile, "\n ");
|
||||
c = 2;
|
||||
}
|
||||
if (isprint(i) && i != ' ')
|
||||
{
|
||||
fprintf(outfile, "%c ", i);
|
||||
c += 2;
|
||||
}
|
||||
else
|
||||
{
|
||||
fprintf(outfile, "\\x%02x ", i);
|
||||
c += 5;
|
||||
if (c > 75)
|
||||
{
|
||||
fprintf(outfile, "\n ");
|
||||
c = 2;
|
||||
}
|
||||
if (isprint(i) && i != ' ')
|
||||
{
|
||||
fprintf(outfile, "%c ", i);
|
||||
c += 2;
|
||||
}
|
||||
else
|
||||
{
|
||||
fprintf(outfile, "\\x%02x ", i);
|
||||
c += 5;
|
||||
}
|
||||
}
|
||||
}
|
||||
fprintf(outfile, "\n");
|
||||
}
|
||||
fprintf(outfile, "\n");
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
/* If the '>' option was present, we write out the regex to a file, and
|
||||
that is all. The first 8 bytes of the file are the regex length and then
|
||||
the study length, in big-endian order. */
|
||||
|
||||
if (to_file != NULL)
|
||||
{
|
||||
FILE *f = fopen((char *)to_file, "wb");
|
||||
if (f == NULL)
|
||||
{
|
||||
fprintf(outfile, "Unable to open %s: %s\n", to_file, strerror(errno));
|
||||
}
|
||||
else
|
||||
{
|
||||
uschar sbuf[8];
|
||||
sbuf[0] = (true_size >> 24) & 255;
|
||||
sbuf[1] = (true_size >> 16) & 255;
|
||||
sbuf[2] = (true_size >> 8) & 255;
|
||||
sbuf[3] = (true_size) & 255;
|
||||
|
||||
sbuf[4] = (true_study_size >> 24) & 255;
|
||||
sbuf[5] = (true_study_size >> 16) & 255;
|
||||
sbuf[6] = (true_study_size >> 8) & 255;
|
||||
sbuf[7] = (true_study_size) & 255;
|
||||
|
||||
if (fwrite(sbuf, 1, 8, f) < 8 ||
|
||||
fwrite(re, 1, true_size, f) < true_size)
|
||||
{
|
||||
fprintf(outfile, "Write error on %s: %s\n", to_file, strerror(errno));
|
||||
}
|
||||
else
|
||||
{
|
||||
fprintf(outfile, "Compiled regex written to %s\n", to_file);
|
||||
if (extra != NULL)
|
||||
{
|
||||
if (fwrite(extra->study_data, 1, true_study_size, f) <
|
||||
true_study_size)
|
||||
{
|
||||
fprintf(outfile, "Write error on %s: %s\n", to_file,
|
||||
strerror(errno));
|
||||
}
|
||||
else fprintf(outfile, "Study data written to %s\n", to_file);
|
||||
}
|
||||
}
|
||||
fclose(f);
|
||||
}
|
||||
continue; /* With next regex */
|
||||
}
|
||||
} /* End of non-POSIX compile */
|
||||
|
||||
/* Read data lines and test them */
|
||||
|
||||
@@ -1045,10 +1328,14 @@ while (!done)
|
||||
}
|
||||
break;
|
||||
|
||||
case 0: /* Allows for an empty line */
|
||||
case 0: /* \ followed by EOF allows for an empty line */
|
||||
p--;
|
||||
continue;
|
||||
|
||||
case '>':
|
||||
while(isdigit(*p)) start_offset = start_offset * 10 + *p++ - '0';
|
||||
continue;
|
||||
|
||||
case 'A': /* Option setting */
|
||||
options |= PCRE_ANCHORED;
|
||||
continue;
|
||||
@@ -1159,6 +1446,10 @@ while (!done)
|
||||
if (n == 0) use_offsets = NULL; /* Ensures it can't write to it */
|
||||
continue;
|
||||
|
||||
case 'P':
|
||||
options |= PCRE_PARTIAL;
|
||||
continue;
|
||||
|
||||
case 'S':
|
||||
show_malloc = 1;
|
||||
continue;
|
||||
@@ -1269,7 +1560,8 @@ while (!done)
|
||||
min = mid;
|
||||
mid = (mid == max - 1)? max : (max > 0)? (min + max)/2 : mid*2;
|
||||
}
|
||||
else if (count >= 0 || count == PCRE_ERROR_NOMATCH)
|
||||
else if (count >= 0 || count == PCRE_ERROR_NOMATCH ||
|
||||
count == PCRE_ERROR_PARTIAL)
|
||||
{
|
||||
if (mid == min + 1)
|
||||
{
|
||||
@@ -1305,8 +1597,11 @@ while (!done)
|
||||
/* The normal case is just to do the match once, with the default
|
||||
value of match_limit. */
|
||||
|
||||
else count = pcre_exec(re, extra, (char *)bptr, len,
|
||||
start_offset, options | g_notempty, use_offsets, use_size_offsets);
|
||||
else
|
||||
{
|
||||
count = pcre_exec(re, extra, (char *)bptr, len,
|
||||
start_offset, options | g_notempty, use_offsets, use_size_offsets);
|
||||
}
|
||||
|
||||
if (count == 0)
|
||||
{
|
||||
@@ -1393,6 +1688,14 @@ while (!done)
|
||||
}
|
||||
}
|
||||
|
||||
/* There was a partial match */
|
||||
|
||||
else if (count == PCRE_ERROR_PARTIAL)
|
||||
{
|
||||
fprintf(outfile, "Partial match\n");
|
||||
break; /* Out of the /g loop */
|
||||
}
|
||||
|
||||
/* Failed to match. If this is a /g or /G loop and we previously set
|
||||
g_notempty after a null match, this is not necessarily the end.
|
||||
We want to advance the start offset, and continue. In the case of UTF-8
|
||||
|
||||
+37
-25
@@ -12,23 +12,31 @@ Written by: Philip Hazel <ph10@cam.ac.uk>
|
||||
Copyright (c) 1997-2004 University of Cambridge
|
||||
|
||||
-----------------------------------------------------------------------------
|
||||
Permission is granted to anyone to use this software for any purpose on any
|
||||
computer system, and to redistribute it freely, subject to the following
|
||||
restrictions:
|
||||
Redistribution and use in source and binary forms, with or without
|
||||
modification, are permitted provided that the following conditions are met:
|
||||
|
||||
1. This software is distributed in the hope that it will be useful,
|
||||
but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
|
||||
* Redistributions of source code must retain the above copyright notice,
|
||||
this list of conditions and the following disclaimer.
|
||||
|
||||
2. The origin of this software must not be misrepresented, either by
|
||||
explicit claim or by omission.
|
||||
* Redistributions in binary form must reproduce the above copyright
|
||||
notice, this list of conditions and the following disclaimer in the
|
||||
documentation and/or other materials provided with the distribution.
|
||||
|
||||
3. Altered versions must be plainly marked as such, and must not be
|
||||
misrepresented as being the original software.
|
||||
* Neither the name of the University of Cambridge nor the names of its
|
||||
contributors may be used to endorse or promote products derived from
|
||||
this software without specific prior written permission.
|
||||
|
||||
4. If PCRE is embedded in any software that is released under the GNU
|
||||
General Purpose Licence (GPL), then the terms of that licence shall
|
||||
supersede any condition above with which it is incompatible.
|
||||
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
|
||||
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
|
||||
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
|
||||
ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
|
||||
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
|
||||
CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
|
||||
SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
|
||||
INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
|
||||
CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
|
||||
ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
|
||||
POSSIBILITY OF SUCH DAMAGE.
|
||||
-----------------------------------------------------------------------------
|
||||
*/
|
||||
|
||||
@@ -57,7 +65,7 @@ Returns: nothing
|
||||
*/
|
||||
|
||||
static void
|
||||
set_bit(uschar *start_bits, int c, BOOL caseless, compile_data *cd)
|
||||
set_bit(uschar *start_bits, unsigned int c, BOOL caseless, compile_data *cd)
|
||||
{
|
||||
start_bits[c/8] |= (1 << (c&7));
|
||||
if (caseless && (cd->ctypes[c] & ctype_letter) != 0)
|
||||
@@ -123,7 +131,7 @@ do
|
||||
/* Skip over callout */
|
||||
|
||||
case OP_CALLOUT:
|
||||
tcode += 2;
|
||||
tcode += 2 + 2*LINK_SIZE;
|
||||
break;
|
||||
|
||||
/* Skip over extended extraction bracket number */
|
||||
@@ -186,11 +194,10 @@ do
|
||||
/* At least one single char sets the bit and stops */
|
||||
|
||||
case OP_EXACT: /* Fall through */
|
||||
tcode++;
|
||||
|
||||
case OP_CHARS: /* Fall through */
|
||||
tcode++;
|
||||
tcode += 2;
|
||||
|
||||
case OP_CHAR:
|
||||
case OP_CHARNC:
|
||||
case OP_PLUS:
|
||||
case OP_MINPLUS:
|
||||
set_bit(start_bits, tcode[1], caseless, cd);
|
||||
@@ -403,8 +410,9 @@ pcre_study(const pcre *external_re, int options, const char **errorptr)
|
||||
uschar start_bits[32];
|
||||
pcre_extra *extra;
|
||||
pcre_study_data *study;
|
||||
const uschar *tables;
|
||||
const real_pcre *re = (const real_pcre *)external_re;
|
||||
uschar *code = (uschar *)re + sizeof(real_pcre) +
|
||||
uschar *code = (uschar *)re + re->name_table_offset +
|
||||
(re->name_count * re->name_entry_size);
|
||||
compile_data compile_block;
|
||||
|
||||
@@ -429,12 +437,16 @@ at present. */
|
||||
if ((re->options & (PCRE_ANCHORED|PCRE_FIRSTSET|PCRE_STARTLINE)) != 0)
|
||||
return NULL;
|
||||
|
||||
/* Set the character tables in the block which is passed around */
|
||||
/* Set the character tables in the block that is passed around */
|
||||
|
||||
compile_block.lcc = re->tables + lcc_offset;
|
||||
compile_block.fcc = re->tables + fcc_offset;
|
||||
compile_block.cbits = re->tables + cbits_offset;
|
||||
compile_block.ctypes = re->tables + ctypes_offset;
|
||||
tables = re->tables;
|
||||
if (tables == NULL)
|
||||
(void)pcre_fullinfo(external_re, NULL, PCRE_INFO_DEFAULT_TABLES, &tables);
|
||||
|
||||
compile_block.lcc = tables + lcc_offset;
|
||||
compile_block.fcc = tables + fcc_offset;
|
||||
compile_block.cbits = tables + cbits_offset;
|
||||
compile_block.ctypes = tables + ctypes_offset;
|
||||
|
||||
/* See if we can find a fixed set of initial characters for the pattern. */
|
||||
|
||||
|
||||
@@ -0,0 +1,151 @@
|
||||
/*************************************************
|
||||
* libucp - Unicode Property Table handler *
|
||||
*************************************************/
|
||||
|
||||
/* This function provides a fast way of obtaining the basic Unicode properties
|
||||
of a character, using a compact binary tree that occupies less than 100K bytes.
|
||||
|
||||
Copyright (c) 2004 University of Cambridge
|
||||
|
||||
-------------------------------------------------------------------------------
|
||||
Redistribution and use in source and binary forms, with or without
|
||||
modification, are permitted provided that the following conditions are met:
|
||||
|
||||
* Redistributions of source code must retain the above copyright notice,
|
||||
this list of conditions and the following disclaimer.
|
||||
|
||||
* Redistributions in binary form must reproduce the above copyright
|
||||
notice, this list of conditions and the following disclaimer in the
|
||||
documentation and/or other materials provided with the distribution.
|
||||
|
||||
* Neither the name of the University of Cambridge nor the names of its
|
||||
contributors may be used to endorse or promote products derived from
|
||||
this software without specific prior written permission.
|
||||
|
||||
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
|
||||
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
|
||||
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
|
||||
ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
|
||||
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
|
||||
CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
|
||||
SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
|
||||
INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
|
||||
CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
|
||||
ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
|
||||
POSSIBILITY OF SUCH DAMAGE.
|
||||
-------------------------------------------------------------------------------
|
||||
*/
|
||||
|
||||
|
||||
#include "ucp.h" /* Exported interface */
|
||||
#include "ucpinternal.h" /* Internal table details */
|
||||
#include "ucptable.c" /* The table itself */
|
||||
|
||||
|
||||
|
||||
/*************************************************
|
||||
* Search table and return data *
|
||||
*************************************************/
|
||||
|
||||
/* Two values are returned: the category is ucp_C, ucp_L, etc. The detailed
|
||||
character type is ucp_Lu, ucp_Nd, etc.
|
||||
|
||||
Arguments:
|
||||
c the character value
|
||||
type_ptr the detailed character type is returned here
|
||||
case_ptr for letters, the opposite case is returned here, if there
|
||||
is one, else zero
|
||||
|
||||
Returns: the character type category or -1 if not found
|
||||
*/
|
||||
|
||||
static int
|
||||
ucp_findchar(const int c, int *type_ptr, int *case_ptr)
|
||||
{
|
||||
cnode *node = ucp_table;
|
||||
register int cc = c;
|
||||
int case_offset;
|
||||
|
||||
for (;;)
|
||||
{
|
||||
register int d = node->f1 | ((node->f0 & f0_chhmask) << 16);
|
||||
if (cc == d) break;
|
||||
if (cc < d)
|
||||
{
|
||||
if ((node->f0 & f0_leftexists) == 0) return -1;
|
||||
node ++;
|
||||
}
|
||||
else
|
||||
{
|
||||
register int roffset = (node->f2 & f2_rightmask) >> f2_rightshift;
|
||||
if (roffset == 0) return -1;
|
||||
node += 1 << (roffset - 1);
|
||||
}
|
||||
}
|
||||
|
||||
switch ((*type_ptr = ((node->f0 & f0_typemask) >> f0_typeshift)))
|
||||
{
|
||||
case ucp_Cc:
|
||||
case ucp_Cf:
|
||||
case ucp_Cn:
|
||||
case ucp_Co:
|
||||
case ucp_Cs:
|
||||
return ucp_C;
|
||||
break;
|
||||
|
||||
case ucp_Ll:
|
||||
case ucp_Lu:
|
||||
case_offset = node->f2 & f2_casemask;
|
||||
if ((case_offset & 0x0100) != 0) case_offset |= 0xfffff000;
|
||||
*case_ptr = (case_offset == 0)? 0 : cc + case_offset;
|
||||
return ucp_L;
|
||||
|
||||
case ucp_Lm:
|
||||
case ucp_Lo:
|
||||
case ucp_Lt:
|
||||
*case_ptr = 0;
|
||||
return ucp_L;
|
||||
break;
|
||||
|
||||
case ucp_Mc:
|
||||
case ucp_Me:
|
||||
case ucp_Mn:
|
||||
return ucp_M;
|
||||
break;
|
||||
|
||||
case ucp_Nd:
|
||||
case ucp_Nl:
|
||||
case ucp_No:
|
||||
return ucp_N;
|
||||
break;
|
||||
|
||||
case ucp_Pc:
|
||||
case ucp_Pd:
|
||||
case ucp_Pe:
|
||||
case ucp_Pf:
|
||||
case ucp_Pi:
|
||||
case ucp_Ps:
|
||||
case ucp_Po:
|
||||
return ucp_P;
|
||||
break;
|
||||
|
||||
case ucp_Sc:
|
||||
case ucp_Sk:
|
||||
case ucp_Sm:
|
||||
case ucp_So:
|
||||
return ucp_S;
|
||||
break;
|
||||
|
||||
case ucp_Zl:
|
||||
case ucp_Zp:
|
||||
case ucp_Zs:
|
||||
return ucp_Z;
|
||||
break;
|
||||
|
||||
default: /* "Should never happen" */
|
||||
return -1;
|
||||
break;
|
||||
}
|
||||
}
|
||||
|
||||
/* End of ucp.c */
|
||||
@@ -0,0 +1,58 @@
|
||||
/*************************************************
|
||||
* libucp - Unicode Property Table handler *
|
||||
*************************************************/
|
||||
|
||||
/* These are the character categories that are returned by ucp_findchar */
|
||||
|
||||
/* General categories, numbered explicitly so the values cannot drift if the
list is ever reordered. */
enum {
  ucp_C = 0,     /* Other */
  ucp_L = 1,     /* Letter */
  ucp_M = 2,     /* Mark */
  ucp_N = 3,     /* Number */
  ucp_P = 4,     /* Punctuation */
  ucp_S = 5,     /* Symbol */
  ucp_Z = 6      /* Separator */
};
|
||||
|
||||
/* These are the detailed character types that are returned by ucp_findchar */
|
||||
|
||||
/* Detailed character types, numbered explicitly. The values are spelled out
so that the numbering stays fixed regardless of the order of the list. */
enum {
  ucp_Cc = 0,    /* Control */
  ucp_Cf = 1,    /* Format */
  ucp_Cn = 2,    /* Unassigned */
  ucp_Co = 3,    /* Private use */
  ucp_Cs = 4,    /* Surrogate */
  ucp_Ll = 5,    /* Lower case letter */
  ucp_Lm = 6,    /* Modifier letter */
  ucp_Lo = 7,    /* Other letter */
  ucp_Lt = 8,    /* Title case letter */
  ucp_Lu = 9,    /* Upper case letter */
  ucp_Mc = 10,   /* Spacing mark */
  ucp_Me = 11,   /* Enclosing mark */
  ucp_Mn = 12,   /* Non-spacing mark */
  ucp_Nd = 13,   /* Decimal number */
  ucp_Nl = 14,   /* Letter number */
  ucp_No = 15,   /* Other number */
  ucp_Pc = 16,   /* Connector punctuation */
  ucp_Pd = 17,   /* Dash punctuation */
  ucp_Pe = 18,   /* Close punctuation */
  ucp_Pf = 19,   /* Final punctuation */
  ucp_Pi = 20,   /* Initial punctuation */
  ucp_Po = 21,   /* Other punctuation */
  ucp_Ps = 22,   /* Open punctuation */
  ucp_Sc = 23,   /* Currency symbol */
  ucp_Sk = 24,   /* Modifier symbol */
  ucp_Sm = 25,   /* Mathematical symbol */
  ucp_So = 26,   /* Other symbol */
  ucp_Zl = 27,   /* Line separator */
  ucp_Zp = 28,   /* Paragraph separator */
  ucp_Zs = 29    /* Space separator */
};
|
||||
|
||||
/* For use in PCRE we make this function static so that there is no conflict if
|
||||
PCRE is linked with an application that makes use of an external version -
|
||||
assuming an external version is ever released... */
|
||||
|
||||
static int ucp_findchar(const int c, int *type_ptr, int *case_ptr);
|
||||
|
||||
/* End of ucp.h */
|
||||
@@ -0,0 +1,91 @@
|
||||
/*************************************************
|
||||
* libucp - Unicode Property Table handler *
|
||||
*************************************************/
|
||||
|
||||
/* Internal header file defining the layout of compact nodes in the tree. */
|
||||
|
||||
/* One compact tree node: three unsigned short fields. */
typedef struct cnode {
  unsigned short f0;   /* left-child flag, type bits, character high bits */
  unsigned short f1;   /* not covered by any mask in this header */
  unsigned short f2;   /* right offset bits, case offset */
} cnode;

/* Masks and shifts for the f0 field */

#define f0_leftexists   0x8000  /* Left child exists */
#define f0_typemask     0x3f00  /* Type bits */
#define f0_typeshift         8  /* Type shift */
#define f0_chhmask      0x00ff  /* Character high bits */

/* Masks and shifts for the f2 field */

#define f2_rightmask    0xf000  /* Mask for right offset bits */
#define f2_rightshift       12  /* Shift for right offset */
#define f2_casemask     0x0fff  /* Mask for case offset */
|
||||
|
||||
/* The tree consists of a vector of structures of type cnode, with the root
|
||||
node as the first element. The three short ints (16-bits) are used as follows:
|
||||
|
||||
(f0) (1) The 0x8000 bit of f0 is set if a left child exists. The child's node
|
||||
is the next node in the vector.
|
||||
(2) The 0x4000 bit of f0 is spare.
|
||||
(3) The 0x3f00 bits of f0 contain the character type; this is a number
|
||||
defined by the enumeration in ucp.h (e.g. ucp_Lu).
|
||||
(4) The bottom 8 bits of f0 contain the most significant byte of the
|
||||
character's 24-bit codepoint.
|
||||
|
||||
(f1) (1) The f1 field contains the two least significant bytes of the
|
||||
codepoint.
|
||||
|
||||
(f2) (1) The 0xf000 bits of f2 contain zero if there is no right child of this
|
||||
node. Otherwise, they contain one plus the exponent of the power of
|
||||
two of the offset to the right node (e.g. a value of 3 means 8). The
|
||||
units of the offset are node items.
|
||||
|
||||
(2) The 0x0fff bits of f2 contain the signed offset from this character to
|
||||
its alternate cased value. They are zero if there is no such
|
||||
character.
|
||||
|
||||
|
||||
-----------------------------------------------------------------------------
|
||||
||.|.| type (6) | ms char (8) || ls char (16) ||....| case offset (12) ||
|
||||
-----------------------------------------------------------------------------
|
||||
| | |
|
||||
| |-> spare |
|
||||
| exponent of right
|
||||
|-> left child exists child offset
|
||||
|
||||
|
||||
The upper/lower casing information is set only for characters that come in
|
||||
pairs. There are (at present) four non-one-to-one mappings in the Unicode data.
|
||||
These are ignored. They are:
|
||||
|
||||
1FBE Greek Prosgegrammeni (lower, with upper -> capital iota)
|
||||
2126 Ohm
|
||||
212A Kelvin
|
||||
212B Angstrom
|
||||
|
||||
Certainly for the last three, having an alternate case would seem to be a
|
||||
mistake. I don't know any Greek, so cannot comment on the first one.
|
||||
|
||||
|
||||
When searching the tree, proceed as follows:
|
||||
|
||||
(1) Start at the first node.
|
||||
|
||||
(2) Extract the character value from f1 and the bottom 8 bits of f0;
|
||||
|
||||
(3) Compare with the character being sought. If equal, we are done.
|
||||
|
||||
(4) If the test character is smaller, inspect the f0_leftexists flag. If it is
|
||||
not set, the character is not in the tree. If it is set, move to the next
|
||||
node, and go to (2).
|
||||
|
||||
(5) If the test character is bigger, extract the f2_rightmask bits from f2, and
|
||||
shift them right by f2_rightshift. If the result is zero, the character is
|
||||
not in the tree. Otherwise, calculate the number of nodes to skip by
|
||||
shifting the value 1 left by this number minus one. Go to (2).
|
||||
*/
|
||||
|
||||
|
||||
/* End of ucpinternal.h */
|
||||
File diff suppressed because it is too large
Load Diff
@@ -0,0 +1,93 @@
|
||||
/*************************************************
|
||||
* Perl-Compatible Regular Expressions *
|
||||
*************************************************/
|
||||
|
||||
/*
|
||||
This is a library of functions to support regular expressions whose syntax
|
||||
and semantics are as close as possible to those of the Perl 5 language. See
|
||||
the file Tech.Notes for some information on the internals.
|
||||
|
||||
Written by: Philip Hazel <ph10@cam.ac.uk>
|
||||
|
||||
Copyright (c) 1997-2004 University of Cambridge
|
||||
|
||||
-----------------------------------------------------------------------------
|
||||
Redistribution and use in source and binary forms, with or without
|
||||
modification, are permitted provided that the following conditions are met:
|
||||
|
||||
* Redistributions of source code must retain the above copyright notice,
|
||||
this list of conditions and the following disclaimer.
|
||||
|
||||
* Redistributions in binary form must reproduce the above copyright
|
||||
notice, this list of conditions and the following disclaimer in the
|
||||
documentation and/or other materials provided with the distribution.
|
||||
|
||||
* Neither the name of the University of Cambridge nor the names of its
|
||||
contributors may be used to endorse or promote products derived from
|
||||
this software without specific prior written permission.
|
||||
|
||||
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
|
||||
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
|
||||
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
|
||||
ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
|
||||
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
|
||||
CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
|
||||
SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
|
||||
INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
|
||||
CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
|
||||
ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
|
||||
POSSIBILITY OF SUCH DAMAGE.
|
||||
-----------------------------------------------------------------------------
|
||||
*/
|
||||
|
||||
/* This module contains a table for translating Unicode property names into
|
||||
code values for the ucp_findchar function. It is in a separate module so that
|
||||
it can be included both in the main pcre library, and into pcretest (for
|
||||
printing out internals). */
|
||||
|
||||
/* One entry of the table that translates a Unicode property name into its
code value. */

typedef struct {
  const char *name;    /* Property name as a zero-terminated string, e.g. "Lu" */
  int value;           /* Code value associated with that name */
} ucp_type_table;
|
||||
|
||||
static ucp_type_table utt[] = {
|
||||
{ "C", 128 + ucp_C },
|
||||
{ "Cc", ucp_Cc },
|
||||
{ "Cf", ucp_Cf },
|
||||
{ "Cn", ucp_Cn },
|
||||
{ "Co", ucp_Co },
|
||||
{ "Cs", ucp_Cs },
|
||||
{ "L", 128 + ucp_L },
|
||||
{ "Ll", ucp_Ll },
|
||||
{ "Lm", ucp_Lm },
|
||||
{ "Lo", ucp_Lo },
|
||||
{ "Lt", ucp_Lt },
|
||||
{ "Lu", ucp_Lu },
|
||||
{ "M", 128 + ucp_M },
|
||||
{ "Mc", ucp_Mc },
|
||||
{ "Me", ucp_Me },
|
||||
{ "Mn", ucp_Mn },
|
||||
{ "N", 128 + ucp_N },
|
||||
{ "Nd", ucp_Nd },
|
||||
{ "Nl", ucp_Nl },
|
||||
{ "No", ucp_No },
|
||||
{ "P", 128 + ucp_P },
|
||||
{ "Pc", ucp_Pc },
|
||||
{ "Pd", ucp_Pd },
|
||||
{ "Pe", ucp_Pe },
|
||||
{ "Pf", ucp_Pf },
|
||||
{ "Pi", ucp_Pi },
|
||||
{ "Po", ucp_Po },
|
||||
{ "Ps", ucp_Ps },
|
||||
{ "S", 128 + ucp_S },
|
||||
{ "Sc", ucp_Sc },
|
||||
{ "Sk", ucp_Sk },
|
||||
{ "Sm", ucp_Sm },
|
||||
{ "So", ucp_So },
|
||||
{ "Z", 128 + ucp_Z },
|
||||
{ "Zl", ucp_Zl },
|
||||
{ "Zp", ucp_Zp },
|
||||
{ "Zs", ucp_Zs }
|
||||
};
|
||||
|
||||
/* End of ucptypetable.c */
|
||||
Reference in New Issue
Block a user