Upgrade PCRE library to 5.0.

2026-04-25 08:58:28 +02:00 · 2005-05-27 18:07:33 +00:00
parent 8ce349b8e0
commit 1d019347cd
30 changed files with 20350 additions and 2095 deletions
@@ -1,6 +1,7 @@
 PHP                                                                        NEWS
 |||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||
 ?? ??? 2005, PHP 5.0.5
+- Upgraded PCRE library to version 5.0. (Andrei)
 - Removed php_check_syntax() function which never worked properly. (Ilia)
 - Added new function mysqli_set_charset(). (Georg)
 - Added man pages for "phpize" and "php-config" scripts. (Jakub Vrana)
@@ -13,7 +13,7 @@ PHP_ARG_WITH(pcre-regex,for PCRE support,

 if test "$PHP_PCRE_REGEX" != "no"; then
  if test "$PHP_PCRE_REGEX" = "yes"; then
-    PHP_NEW_EXTENSION(pcre, pcrelib/maketables.c pcrelib/get.c pcrelib/study.c pcrelib/pcre.c php_pcre.c, $ext_shared,,-DSUPPORT_UTF8 -DLINK_SIZE=2 -DPOSIX_MALLOC_THRESHOLD=10 -I@ext_srcdir@/pcrelib)
+    PHP_NEW_EXTENSION(pcre, pcrelib/maketables.c pcrelib/get.c pcrelib/study.c pcrelib/pcre.c php_pcre.c, $ext_shared,,-DSUPPORT_UTF8 -DSUPPORT_UCP -DLINK_SIZE=2 -DPOSIX_MALLOC_THRESHOLD=10 -DMATCH_LIMIT=10000000 -I@ext_srcdir@/pcrelib)
    PHP_ADD_BUILD_DIR($ext_builddir/pcrelib)
    AC_DEFINE(HAVE_BUNDLED_PCRE, 1, [ ])
  else
@@ -50,7 +50,7 @@ if test "$PHP_PCRE_REGEX" != "no"; then
    
    AC_DEFINE(HAVE_PCRE, 1, [ ])
    PHP_ADD_INCLUDE($PCRE_INCDIR)
-    PHP_NEW_EXTENSION(pcre, php_pcre.c, $ext_shared,,-DSUPPORT_UTF8 -DLINK_SIZE=2 -DPOSIX_MALLOC_THRESHOLD=10)
+    PHP_NEW_EXTENSION(pcre, php_pcre.c, $ext_shared,,-DSUPPORT_UTF8 -DSUPPORT_UCP -DLINK_SIZE=2 -DPOSIX_MALLOC_THRESHOLD=10 -DMATCH_LIMIT=10000000)
  fi
  PHP_SUBST(PCRE_SHARED_LIBADD)
 fi
@@ -3,4 +3,4 @@ Written by: Philip Hazel <ph10@cam.ac.uk>
 University of Cambridge Computing Service,
 Cambridge, England. Phone: +44 1223 334714.

-Copyright (c) 1997-2003 University of Cambridge
+Copyright (c) 1997-2004 University of Cambridge
@@ -4,51 +4,42 @@ PCRE LICENCE
 PCRE is a library of functions to support regular expressions whose syntax
 and semantics are as close as possible to those of the Perl 5 language.

+Release 5 of PCRE is distributed under the terms of the "BSD" licence, as
+specified below. The documentation for PCRE, supplied in the "doc"
+directory, is distributed under the same terms as the software itself.
+
 Written by: Philip Hazel <ph10@cam.ac.uk>

 University of Cambridge Computing Service,
 Cambridge, England. Phone: +44 1223 334714.

-Copyright (c) 1997-2003 University of Cambridge
+Copyright (c) 1997-2004 University of Cambridge
+All rights reserved.

-Permission is granted to anyone to use this software for any purpose on any
-computer system, and to redistribute it freely, subject to the following
-restrictions:
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions are met:

-1. This software is distributed in the hope that it will be useful,
-   but WITHOUT ANY WARRANTY; without even the implied warranty of
-   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+    * Redistributions of source code must retain the above copyright notice,
+      this list of conditions and the following disclaimer.

-2. The origin of this software must not be misrepresented, either by
-   explicit claim or by omission. In practice, this means that if you use
-   PCRE in software that you distribute to others, commercially or
-   otherwise, you must put a sentence like this
+    * Redistributions in binary form must reproduce the above copyright
+      notice, this list of conditions and the following disclaimer in the
+      documentation and/or other materials provided with the distribution.

-     Regular expression support is provided by the PCRE library package,
-     which is open source software, written by Philip Hazel, and copyright
-     by the University of Cambridge, England.
+    * Neither the name of the University of Cambridge nor the names of its
+      contributors may be used to endorse or promote products derived from
+      this software without specific prior written permission.

-   somewhere reasonably visible in your documentation and in any relevant
-   files or online help data or similar. A reference to the ftp site for
-   the source, that is, to
-
-     ftp://ftp.csx.cam.ac.uk/pub/software/programming/pcre/
-
-   should also be given in the documentation. However, this condition is not
-   intended to apply to whole chains of software. If package A includes PCRE,
-   it must acknowledge it, but if package B is software that includes package
-   A, the condition is not imposed on package B (unless it uses PCRE
-   independently).
-
-3. Altered versions must be plainly marked as such, and must not be
-   misrepresented as being the original software.
-
-4. If PCRE is embedded in any software that is released under the GNU
-   General Purpose Licence (GPL), or Lesser General Purpose Licence (LGPL),
-   then the terms of that licence shall supersede any condition above with
-   which it is incompatible.
-
-The documentation for PCRE, supplied in the "doc" directory, is distributed
-under the same terms as the software itself.
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
+LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+POSSIBILITY OF SUCH DAMAGE.

 End
@@ -1,6 +1,142 @@
 ChangeLog for PCRE
 ------------------

+Version 5.0 13-Sep-04
+---------------------
+
+ 1. Internal change: literal characters are no longer packed up into items
+    containing multiple characters in a single byte-string. Each character
+    is now matched using a separate opcode. However, there may be more than one
+    byte in the character in UTF-8 mode.
+
+ 2. The pcre_callout_block structure has two new fields: pattern_position and
+    next_item_length. These contain the offset in the pattern to the next match
+    item, and its length, respectively.
+
+ 3. The PCRE_AUTO_CALLOUT option for pcre_compile() requests the automatic
+    insertion of callouts before each pattern item. Added the /C option to
+    pcretest to make use of this.
+
+ 4. On the advice of a Windows user, the lines
+
+      #if defined(_WIN32) || defined(WIN32)
+      _setmode( _fileno( stdout ), 0x8000 );
+      #endif  /* defined(_WIN32) || defined(WIN32) */
+
+    have been added to the source of pcretest. This apparently does useful
+    magic in relation to line terminators.
+
+ 5. Changed "r" and "w" in the calls to fopen() in pcretest to "rb" and "wb"
+    for the benefit of those environments where the "b" makes a difference.
+
+ 6. The icc compiler has the same options as gcc, but "configure" doesn't seem
+    to know about it. I have put a hack into configure.in that adds in code
+    to set GCC=yes if CC=icc. This seems to end up at a point in the
+    generated configure script that is early enough to affect the setting of
+    compiler options, which is what is needed, but I have no means of testing
+    whether it really works. (The user who reported this had patched the
+    generated configure script, which of course I cannot do.)
+
+    LATER: After change 22 below (new libtool files), the configure script
+    seems to know about icc (and also ecc). Therefore, I have commented out
+    this hack in configure.in.
+
+ 7. Added support for pkg-config (2 patches were sent in).
+
+ 8. Negated POSIX character classes that used a combination of internal tables
+    were completely broken. These were [[:^alpha:]], [[:^alnum:]], and
+    [[:^ascii:]]. Typically, they would match almost any characters. The other
+    POSIX classes were not broken in this way.
+
+ 9. Matching the pattern "\b.*?" against "ab cd", starting at offset 1, failed
+    to find the match, as PCRE was deluded into thinking that the match had to
+    start at the start point or following a newline. The same bug applied to
+    patterns with negative forward assertions or any backward assertions
+    preceding ".*" at the start, unless the pattern required a fixed first
+    character. This was a failing pattern: "(?!.bcd).*". The bug is now fixed.
+
+10. In UTF-8 mode, when moving forwards in the subject after a failed match
+    starting at the last subject character, bytes beyond the end of the subject
+    string were read.
+
+11. Renamed the variable "class" as "classbits" to make life easier for C++
+    users. (Previously there was a macro definition, but it apparently wasn't
+    enough.)
+
+12. Added the new field "tables" to the extra data so that tables can be passed
+    in at exec time, or the internal tables can be re-selected. This allows
+    a compiled regex to be saved and re-used at a later time by a different
+    program that might have everything at different addresses.
+
+13. Modified the pcre-config script so that, when run on Solaris, it shows a
+    -R library as well as a -L library.
+
+14. The debugging options of pcretest (-d on the command line or D on a
+    pattern) showed incorrect output for anything following an extended class
+    that contained multibyte characters and which was followed by a quantifier.
+
+15. Added optional support for general category Unicode character properties
+    via the \p, \P, and \X escapes. Unicode property support implies UTF-8
+    support. It adds about 90K to the size of the library. The meanings of the
+    inbuilt class escapes such as \d and \s have NOT been changed.
+
+16. Updated pcredemo.c to include calls to free() to release the memory for the
+    compiled pattern.
+
+17. The generated file chartables.c was being created in the source directory
+    instead of in the building directory. This caused the build to fail if the
+    source directory was different from the building directory, and was
+    read-only.
+
+18. Added some sample Win commands from Mark Tetrode into the NON-UNIX-USE
+    file. No doubt somebody will tell me if they don't make sense... Also added
+    Dan Mooney's comments about building on OpenVMS.
+
+19. Added support for partial matching via the PCRE_PARTIAL option for
+    pcre_exec() and the \P data escape in pcretest.
+
+20. Extended pcretest with 3 new pattern features:
+
+    (i)   A pattern option of the form ">rest-of-line" causes pcretest to
+          write the compiled pattern to the file whose name is "rest-of-line".
+          This is a straight binary dump of the data, with the saved pointer to
+          the character tables forced to be NULL. The study data, if any, is
+          written too. After writing, pcretest reads a new pattern.
+
+    (ii)  If, instead of a pattern, "<rest-of-line" is given, pcretest reads a
+          compiled pattern from the given file. There must not be any
+          occurrences of "<" in the file name (pretty unlikely); if there are,
+          pcretest will instead treat the initial "<" as a pattern delimiter.
+          After reading in the pattern, pcretest goes on to read data lines as
+          usual.
+
+    (iii) The F pattern option causes pcretest to flip the bytes in the 32-bit
+          and 16-bit fields in a compiled pattern, to simulate a pattern that
+          was compiled on a host of opposite endianness.
+
+21. The pcre_exec() function can now cope with patterns that were compiled on
+    hosts of opposite endianness, with this restriction:
+
+      As for any compiled expression that is saved and used later, the tables
+      pointer field cannot be preserved; the extra_data field in the arguments
+      to pcre_exec() should be used to pass in a tables address if a value
+      other than the default internal tables was used at compile time.
+
+22. Calling pcre_exec() with a negative value of the "ovecsize" parameter is
+    now diagnosed as an error. Previously, most of the time, a negative number
+    would have been treated as zero, but if in addition "ovector" was passed as
+    NULL, a crash could occur.
+
+23. Updated the files ltmain.sh, config.sub, config.guess, and aclocal.m4 with
+    new versions from the libtool 1.5 distribution (the last one is a copy of
+    a file called libtool.m4). This seems to have fixed the need to patch
+    "configure" to support Darwin 1.3 (which I used to do). However, I still
+    had to patch ltmain.sh to ensure that ${SED} is set (it isn't on my
+    workstation).
+
+24. Changed the PCRE licence to be the more standard "BSD" licence.
+
+
 Version 4.5 01-Dec-03
 ---------------------

@@ -4,51 +4,42 @@ PCRE LICENCE
 PCRE is a library of functions to support regular expressions whose syntax
 and semantics are as close as possible to those of the Perl 5 language.

+Release 5 of PCRE is distributed under the terms of the "BSD" licence, as
+specified below. The documentation for PCRE, supplied in the "doc"
+directory, is distributed under the same terms as the software itself.
+
 Written by: Philip Hazel <ph10@cam.ac.uk>

 University of Cambridge Computing Service,
 Cambridge, England. Phone: +44 1223 334714.

-Copyright (c) 1997-2003 University of Cambridge
+Copyright (c) 1997-2004 University of Cambridge
+All rights reserved.

-Permission is granted to anyone to use this software for any purpose on any
-computer system, and to redistribute it freely, subject to the following
-restrictions:
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions are met:

-1. This software is distributed in the hope that it will be useful,
-   but WITHOUT ANY WARRANTY; without even the implied warranty of
-   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+    * Redistributions of source code must retain the above copyright notice,
+      this list of conditions and the following disclaimer.

-2. The origin of this software must not be misrepresented, either by
-   explicit claim or by omission. In practice, this means that if you use
-   PCRE in software that you distribute to others, commercially or
-   otherwise, you must put a sentence like this
+    * Redistributions in binary form must reproduce the above copyright
+      notice, this list of conditions and the following disclaimer in the
+      documentation and/or other materials provided with the distribution.

-     Regular expression support is provided by the PCRE library package,
-     which is open source software, written by Philip Hazel, and copyright
-     by the University of Cambridge, England.
+    * Neither the name of the University of Cambridge nor the names of its
+      contributors may be used to endorse or promote products derived from
+      this software without specific prior written permission.

-   somewhere reasonably visible in your documentation and in any relevant
-   files or online help data or similar. A reference to the ftp site for
-   the source, that is, to
-
-     ftp://ftp.csx.cam.ac.uk/pub/software/programming/pcre/
-
-   should also be given in the documentation. However, this condition is not
-   intended to apply to whole chains of software. If package A includes PCRE,
-   it must acknowledge it, but if package B is software that includes package
-   A, the condition is not imposed on package B (unless it uses PCRE
-   independently).
-
-3. Altered versions must be plainly marked as such, and must not be
-   misrepresented as being the original software.
-
-4. If PCRE is embedded in any software that is released under the GNU
-   General Purpose Licence (GPL), or Lesser General Purpose Licence (LGPL),
-   then the terms of that licence shall supersede any condition above with
-   which it is incompatible.
-
-The documentation for PCRE, supplied in the "doc" directory, is distributed
-under the same terms as the software itself.
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
+LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+POSSIBILITY OF SUCH DAMAGE.

 End
@@ -1,6 +1,53 @@
 News about PCRE releases
 ------------------------

+Release 5.0 13-Sep-04
+---------------------
+
+The licence under which PCRE is released has been changed to the more
+conventional "BSD" licence.
+
+In the code, some bugs have been fixed, and there are also some major changes
+in this release (which is why I've increased the number to 5.0). Some changes
+are internal rearrangements, and some provide a number of new facilities. The
+new features are:
+
+1. There's an "automatic callout" feature that inserts callouts before every
+   item in the regex, and there's a new callout field that gives the position
+   in the pattern - useful for debugging and tracing.
+
+2. The extra_data structure can now be used to pass in a set of character
+   tables at exec time. This is useful if compiled regex are saved and re-used
+   at a later time when the tables may not be at the same address. If the
+   default internal tables are used, the pointer saved with the compiled
+   pattern is now set to NULL, which means that you don't need to do anything
+   special unless you are using custom tables.
+
+3. It is possible, with some restrictions on the content of the regex, to
+   request "partial" matching. A special return code is given if all of the
+   subject string matched part of the regex. This could be useful for testing
+   an input field as it is being typed.
+
+4. There is now some optional support for Unicode character properties, which
+   means that pattern items such as \p{Lu} and \X can now be used. Only
+   the general category properties are supported. If PCRE is compiled with this
+   support, an additional 90K data structure is included, which increases the
+   size of the library dramatically.
+
+5. There is support for saving compiled patterns and re-using them later.
+
+6. There is support for running regular expressions that were compiled on a
+   different host with the opposite endianness.
+
+7. The pcretest program has been extended to accommodate the new features.
+
+The main internal rearrangement is that sequences of literal characters are no
+longer handled as strings. Instead, each character is handled on its own. This
+makes some UTF-8 handling easier, and makes the support of partial matching
+possible. Compiled patterns containing long literal strings will be larger as a
+result of this change; I hope that performance will not be much affected.
+
+
 Release 4.5 01-Dec-03
 ---------------------

@@ -1,19 +1,25 @@
 Compiling PCRE on non-Unix systems
 ----------------------------------

-See below for comments on Cygwin or MinGW usage. I (Philip Hazel) have no
-knowledge of Windows sytems and how their libraries work. The items in the
-PCRE Makefile that relate to anything other than Unix-like systems have been
-contributed by PCRE users. There are some other comments and files in the
-Contrib directory on the ftp site that you may find useful.
+See below for comments on Cygwin or MinGW and OpenVMS usage. I (Philip Hazel)
+have no knowledge of Windows or VMS systems and how their libraries work. The
+items in the PCRE Makefile that relate to anything other than Unix-like systems
+have been contributed by PCRE users. There are some other comments and files in
+the Contrib directory on the ftp site that you may find useful. See

-The following are generic comments about building PCRE:
+  ftp://ftp.csx.cam.ac.uk/pub/software/programming/pcre/Contrib

 If you want to compile PCRE for a non-Unix system (or perhaps, more strictly,
-for a system that does not support "configure" and make files), note that PCRE
-consists entirely of code written in Standard C, and so should compile
-successfully on any machine with a Standard C compiler and library, using
-normal compiling commands to do the following:
+for a system that does not support "configure" and "make" files), note that
+PCRE consists entirely of code written in Standard C, and so should compile
+successfully on any system that has a Standard C compiler and library.
+
+
+GENERIC INSTRUCTIONS
+
+The following are generic comments about building PCRE. The interspersed
+indented commands are suggestions from Mark Tetrode as to which commands you
+might use on a Windows system to build a static library.

 (1) Copy or rename the file config.in as config.h, and change the macros that
 define HAVE_STRERROR and HAVE_MEMMOVE to define them as 1 rather than 0.
@@ -23,32 +29,85 @@ particular, if you want to force a specific value for newline, you can define
 the NEWLINE macro. The default is to use '\n', thereby using whatever value
 your compiler gives to '\n'.

+  rem Mark Tetrode's commands
+  copy config.in config.h
+  rem Use write, because notepad cannot handle UNIX files. Change values.
+  write config.h
+
 (2) Copy or rename the file pcre.in as pcre.h, and change the macro definitions
 for PCRE_MAJOR, PCRE_MINOR, and PCRE_DATE near its start to the values set in
 configure.in.

+  rem Mark Tetrode's commands
+  copy pcre.in pcre.h
+  rem Read values from configure.in
+  write configure.in
+  rem Change values
+  write pcre.h
+
 (3) Compile dftables.c as a stand-alone program, and then run it with
 the single argument "chartables.c". This generates a set of standard
 character tables and writes them to that file.

+  rem Mark Tetrode's commands
+  rem Compile & run
+  cl -DSUPPORT_UTF8 dftables.c
+  dftables.exe > chartables.c
+
 (4) Compile maketables.c, get.c, study.c and pcre.c and link them all
 together into an object library in whichever form your system keeps such
 libraries. This is the pcre library (chartables.c is included by means of an
 #include directive). If your system has static and shared libraries, you may
 have to do this once for each type.

+  rem Mark Tetrode's commands, for a static library
+  rem Compile & lib
+  cl -DSUPPORT_UTF8 -DPOSIX_MALLOC_THRESHOLD=10 /c maketables.c get.c study.c pcre.c
+  lib /OUT:pcre.lib maketables.obj get.obj study.obj pcre.obj
+
 (5) Similarly, compile pcreposix.c and link it (on its own) as the pcreposix
 library.

+  rem Mark Tetrode's commands, for a static library
+  rem Compile & lib
+  cl -DSUPPORT_UTF8 -DPOSIX_MALLOC_THRESHOLD=10 /c pcreposix.c
+  lib /OUT:pcreposix.lib pcreposix.obj
+
 (6) Compile the test program pcretest.c. This needs the functions in the
 pcre and pcreposix libraries when linking.

+  rem Mark Tetrode's commands
+  rem compile & link
+  cl pcretest.c pcre.lib pcreposix.lib
+
 (7) Run pcretest on the testinput files in the testdata directory, and check
 that the output matches the corresponding testoutput files. You must use the
 -i option when checking testinput2. Note that the supplied files are in Unix
 format, with just LF characters as line terminators. You may need to edit them
 to change this if your system uses a different convention.

+  rem Mark Tetrode's commands
+  rem Make a change, i.e. space, backspace, and save again - do this for all
+  rem to change UNIX to Win, \n to \r\n
+  write testoutput1
+  write testoutput2
+  write testoutput3
+  write testoutput4
+  write testoutput5
+  pcretest testdata\testinput1 testdata\myoutput1
+  windiff testdata\testoutput1 testdata\myoutput1
+  pcretest -i testdata\testinput2 testdata\myoutput2
+  windiff testdata\testoutput2 testdata\myoutput2
+  pcretest testdata\testinput3 testdata\myoutput3
+  windiff testdata\testoutput3 testdata\myoutput3
+  pcretest testdata\testinput4 testdata\myoutput4
+  windiff testdata\testoutput4 testdata\myoutput4
+  pcretest testdata\testinput5 testdata\myoutput5
+  windiff testdata\testoutput5 testdata\myoutput5
+
+
+FURTHER REMARKS
+
 If you have a system without "configure" but where you can use a Makefile, edit
 Makefile.in to create Makefile, substituting suitable values for the variables
 at the head of the file.
@@ -119,4 +178,67 @@ void  (*pcre_free)(void *) = free;
 #endif
 =========================

+
+BUILDING PCRE ON OPENVMS
+
+Dan Mooney sent the following comments about building PCRE on OpenVMS:
+
+"It was quite easy to compile and link the library. I don't have a formal
+make file but the attached file [reproduced below] contains the OpenVMS DCL
+commands I used to build the library. I had to add #define
+POSIX_MALLOC_THRESHOLD 10 to pcre.h since it was not defined anywhere.
+
+The library was built on:
+O/S: HP OpenVMS v7.3-1
+Compiler: Compaq C v6.5-001-48BCD
+Linker: vA13-01
+
+The test results did not match 100% due to the issues you mention in your
+documentation regarding isprint(), iscntrl(), isgraph() and ispunct(). I
+modified some of the character tables temporarily and was able to get the
+results to match. Tests using the fr locale did not match since I don't have
+that locale loaded. The study size was always reported to be 3 less than the
+value in the standard test output files."
+
+=========================
+$! This DCL procedure builds PCRE on OpenVMS
+$!
+$! I followed the instructions in the non-unix-use file in the distribution.
+$!
+$ COMPILE == "CC/LIST/NOMEMBER_ALIGNMENT/PREFIX_LIBRARY_ENTRIES=ALL_ENTRIES
+$ COMPILE DFTABLES.C
+$ LINK/EXE=DFTABLES.EXE DFTABLES.OBJ
+$ RUN DFTABLES.EXE/OUTPUT=CHARTABLES.C
+$ COMPILE MAKETABLES.C
+$ COMPILE GET.C
+$ COMPILE STUDY.C
+$! I had to set POSIX_MALLOC_THRESHOLD to 10 in PCRE.H since the symbol
+$! did not seem to be defined anywhere.
+$! I edited pcre.h and added #DEFINE SUPPORT_UTF8 to enable UTF8 support.
+$ COMPILE PCRE.C
+$ LIB/CREATE PCRE MAKETABLES.OBJ, GET.OBJ, STUDY.OBJ, PCRE.OBJ
+$! I had to set POSIX_MALLOC_THRESHOLD to 10 in PCRE.H since the symbol
+$! did not seem to be defined anywhere.
+$ COMPILE PCREPOSIX.C
+$ LIB/CREATE PCREPOSIX PCREPOSIX.OBJ
+$ COMPILE PCRETEST.C
+$ LINK/EXE=PCRETEST.EXE PCRETEST.OBJ, PCRE/LIB, PCREPOSIX/LIB
+$! C programs that want access to command line arguments must be
+$! defined as a symbol
+$ PCRETEST :== "$ SYS$ROADSUSERS:[DMOONEY.REGEXP]PCRETEST.EXE"
+$! Arguments must be enclosed in quotes.
+$ PCRETEST "-C"
+$! Test results:
+$!
+$!   The test results did not match 100%. The functions isprint(), iscntrl(),
+$!   isgraph() and ispunct() on OpenVMS must not produce the same results
+$!   as the system that built the test output files provided with the
+$!   distribution.
+$!
+$!   The study size did not match and was always 3 less on OpenVMS.
+$!
+$!   Locale could not be set to fr
+$!
+=========================
+
 ****
@@ -22,6 +22,28 @@ ensure that they link with PCRE's libpcreposix library. Otherwise they may pick
 up the "real" POSIX functions of the same name.


+Documentation for PCRE
+----------------------
+
+If you install PCRE in the normal way, you will end up with an installed set of
+man pages whose names all start with "pcre". The one that is called "pcre"
+lists all the others. In addition to these man pages, the PCRE documentation is
+supplied in two other forms; however, as there is no standard place to install
+them, they are left in the doc directory of the unpacked source distribution.
+These forms are:
+
+  1. Files called doc/pcre.txt, doc/pcregrep.txt, and doc/pcretest.txt. The
+     first of these is a concatenation of the text forms of all the section 3
+     man pages except those that summarize individual functions. The other two
+     are the text forms of the section 1 man pages for the pcregrep and
+     pcretest commands. Text forms are provided for ease of scanning with text
+     editors or similar tools.
+
+  2. A subdirectory called doc/html contains all the documentation in HTML
+     form, hyperlinked in various ways, and rooted in a file called
+     doc/index.html.
+
+
 Contributions by users of PCRE
 ------------------------------

@@ -46,7 +68,7 @@ INSTALL.

 Most commonly, people build PCRE within its own distribution directory, and in
 this case, on many systems, just running "./configure" is sufficient, but the
-usual methods of changing standard defaults are available. For example,
+usual methods of changing standard defaults are available. For example:

 CFLAGS='-O2 -Wall' ./configure --prefix=/opt/local

@@ -69,6 +91,13 @@ library. You can read more about them in the pcrebuild man page.
  for handling UTF-8 is not included in the library. (Even when included, it
  still has to be enabled by an option at run time.)

+. If, in addition to support for UTF-8 character strings, you want to include
+  support for the \P, \p, and \X sequences that recognize Unicode character
+  properties, you must add --enable-unicode-properties to the "configure"
+  command. This adds about 90K to the size of the library (in the form of a
+  property table); only the basic two-letter properties such as Lu are
+  supported.
+
 . You can build PCRE to recognize CR or NL as the newline character, instead
  of whatever your compiler uses for "\n", by adding --newline-is-cr or
  --newline-is-nl to the "configure" command, respectively. Only do this if you
@@ -111,12 +140,14 @@ library. You can read more about them in the pcrebuild man page.
  on the "configure" command. PCRE runs more slowly in this mode, but it may be
  necessary in environments with limited stack sizes.

-The "configure" script builds five files:
+The "configure" script builds seven files:

-. libtool is a script that builds shared and/or static libraries
+. pcre.h is built by copying pcre.in and making substitutions.
 . Makefile is built by copying Makefile.in and making substitutions.
 . config.h is built by copying config.in and making substitutions.
 . pcre-config is built by copying pcre-config.in and making substitutions.
+. libpcre.pc is data for the pkg-config command, built from libpcre.pc.in
+. libtool is a script that builds shared and/or static libraries
 . RunTest is a script for running tests

 Once "configure" has run, you can run "make". It builds two libraries called
@@ -125,20 +156,33 @@ command. You can use "make install" to copy these, the public header files
 pcre.h and pcreposix.h, and the man pages to appropriate live directories on
 your system, in the normal way.

+
+Retrieving configuration information on Unix-like systems
+---------------------------------------------------------
+
 Running "make install" also installs the command pcre-config, which can be used
 to recall information about the PCRE configuration and installation. For
-example,
+example:

  pcre-config --version

 prints the version number, and

- pcre-config --libs
+  pcre-config --libs

 outputs information about where the library is installed. This command can be
 included in makefiles for programs that use PCRE, saving the programmer from
 having to remember too many details.

+The pkg-config command is another system for saving and retrieving information
+about installed libraries. Instead of separate commands for each library, a
+single command is used. For example:
+
+  pkg-config --cflags pcre
+
+The data is held in *.pc files that are installed in a directory called
+pkgconfig.
+

 Shared libraries on Unix-like systems
 -------------------------------------
@@ -158,7 +202,7 @@ installed themselves. However, the versions left in the source directory still
 use the uninstalled libraries.

 To build PCRE using static libraries only you must use --disable-shared when
-configuring it. For example
+configuring it. For example:

 ./configure --prefix=/usr/gnu --disable-shared

@@ -202,9 +246,9 @@ configuring process. (This can also be run by "make runtest", "make check", or
 The script runs the pcretest test program (which is documented in its own man
 page) on each of the testinput files (in the testdata directory) in turn,
 and compares the output with the contents of the corresponding testoutput file.
-A file called testtry is used to hold the output from pcretest. To run pcretest
-on just one of the test files, give its number as an argument to RunTest, for
-example:
+A file called testtry is used to hold the main output from pcretest
+(testsavedregex is also used as a working file). To run pcretest on just one of
+the test files, give its number as an argument to RunTest, for example:

  RunTest 2

@@ -247,19 +291,23 @@ running "configure". This file can be also fed directly to the perltest script,
 provided you are running Perl 5.8 or higher. (For Perl 5.6, a small patch,
 commented in the script, can be used.)

-The fifth and final file tests error handling with UTF-8 encoding, and internal
-UTF-8 features of PCRE that are not relevant to Perl.
+The fifth test checks error handling with UTF-8 encoding, and internal UTF-8
+features of PCRE that are not relevant to Perl.
+
+The sixth and final test checks the support for Unicode character properties.
+It is not run automatically unless PCRE is built with Unicode property support.
+To do this you must set --enable-unicode-properties when running "configure".


 Character tables
 ----------------

-PCRE uses four tables for manipulating and identifying characters. The final
-argument of the pcre_compile() function is a pointer to a block of memory
-containing the concatenated tables. A call to pcre_maketables() can be used to
-generate a set of tables in the current locale. If the final argument for
-pcre_compile() is passed as NULL, a set of default tables that is built into
-the binary is used.
+PCRE uses four tables for manipulating and identifying characters whose values
+are less than 256. The final argument of the pcre_compile() function is a
+pointer to a block of memory containing the concatenated tables. A call to
+pcre_maketables() can be used to generate a set of tables in the current
+locale. If the final argument for pcre_compile() is passed as NULL, a set of
+default tables that is built into the binary is used.

 The source file called chartables.c contains the default set of tables. This is
 not supplied in the distribution, but is built by the program dftables
@@ -299,12 +347,20 @@ The distribution should contain the following files:
    headers:

  dftables.c            auxiliary program for building chartables.c
+
  get.c                 )
  maketables.c          )
-  study.c               ) source of
-  pcre.c                )   the functions
+  study.c               ) source of the functions
+  pcre.c                )   in the library
  pcreposix.c           )
  printint.c            )
+
+  ucp.c                 )
+  ucp.h                 ) source for the code that is used for
+  ucpinternal.h         )   Unicode property handling
+  ucptable.c            )
+  ucptypetable.c        )
+
  pcre.in               "source" for the header for the external API; pcre.h
                          is built from this by "configure"
  pcreposix.h           header for the external POSIX wrapper API
@@ -335,7 +391,9 @@ The distribution should contain the following files:
  doc/pcretest.txt      plain text documentation of test program
  doc/perltest.txt      plain text documentation of Perl test program
  install-sh            a shell script for installing files
+  libpcre.pc.in         "source" for libpcre.pc for pkg-config
  ltmain.sh             file used to build a libtool script
+  mkinstalldirs         script for making install directories
  pcretest.c            comprehensive test program
  pcredemo.c            simple demonstration of coding calls to PCRE
  perltest              Perl test program
@@ -346,15 +404,19 @@ The distribution should contain the following files:
  testdata/testinput3   test data for locale-specific tests
  testdata/testinput4   test data for UTF-8 tests compatible with Perl
  testdata/testinput5   test data for other UTF-8 tests
+  testdata/testinput6   test data for Unicode property support tests
  testdata/testoutput1  test results corresponding to testinput1
  testdata/testoutput2  test results corresponding to testinput2
  testdata/testoutput3  test results corresponding to testinput3
  testdata/testoutput4  test results corresponding to testinput4
  testdata/testoutput5  test results corresponding to testinput5
+  testdata/testoutput6  test results corresponding to testinput6

 (C) Auxiliary files for Win32 DLL

  dll.mk
+  libpcre.def
+  libpcreposix.def
  pcre.def

 (D) Auxiliary file for VPASCAL
@@ -362,4 +424,4 @@ The distribution should contain the following files:
  makevp.bat

 Philip Hazel <ph10@cam.ac.uk>
-December 2003
+September 2004
@@ -11,26 +11,32 @@ Written by: Philip Hazel <ph10@cam.ac.uk>
           Copyright (c) 1997-2004 University of Cambridge

 -----------------------------------------------------------------------------
-Permission is granted to anyone to use this software for any purpose on any
-computer system, and to redistribute it freely, subject to the following
-restrictions:
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions are met:

-1. This software is distributed in the hope that it will be useful,
-   but WITHOUT ANY WARRANTY; without even the implied warranty of
-   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+    * Redistributions of source code must retain the above copyright notice,
+      this list of conditions and the following disclaimer.

-2. The origin of this software must not be misrepresented, either by
-   explicit claim or by omission.
+    * Redistributions in binary form must reproduce the above copyright
+      notice, this list of conditions and the following disclaimer in the
+      documentation and/or other materials provided with the distribution.

-3. Altered versions must be plainly marked as such, and must not be
-   misrepresented as being the original software.
+    * Neither the name of the University of Cambridge nor the names of its
+      contributors may be used to endorse or promote products derived from
+      this software without specific prior written permission.

-4. If PCRE is embedded in any software that is released under the GNU
-   General Purpose Licence (GPL), then the terms of that licence shall
-   supersede any condition above with which it is incompatible.
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
+LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+POSSIBILITY OF SUCH DAMAGE.
 -----------------------------------------------------------------------------
-
-See the file Tech.Notes for some information on the internals.
 */


@@ -1,6 +1,9 @@
 Technical Notes about PCRE
 --------------------------

+Historical note 1
+-----------------
+
 Many years ago I implemented some regular expression functions to an algorithm
 suggested by Martin Richards. These were not Unix-like in form, and were quite
 restricted in what they could do by comparison with Perl. The interesting part
@@ -9,12 +12,15 @@ form of an expression was known in advance. The code to apply an expression did
 not operate by backtracking, as the original Henry Spencer code and current
 Perl code does, but instead checked all possibilities simultaneously by keeping
 a list of current states and checking all of them as it advanced through the
-subject string. (In the terminology of Jeffrey Friedl's book, it was a "DFA
-algorithm".) When the pattern was all used up, all remaining states were
+subject string. In the terminology of Jeffrey Friedl's book, it was a "DFA
+algorithm". When the pattern was all used up, all remaining states were
 possible matches, and the one matching the longest subset of the subject string
 was chosen. This did not necessarily maximize the individual wild portions of
 the pattern, as is expected in Unix and Perl-style regular expressions.

+Historical note 2
+-----------------
+
 By contrast, the code originally written by Henry Spencer and subsequently
 heavily modified for Perl actually compiles the expression twice: once in a
 dummy mode in order to find out how much store will be needed, and then for
@@ -23,6 +29,9 @@ optionally, minimizing in Perl) the amount of the subject that matches
 individual wild portions of the pattern. This is an "NFA algorithm" in Friedl's
 terminology.

+OK, here's the real stuff
+-------------------------
+
 For the set of functions that forms PCRE (which are unrelated to those
 mentioned above), I tried at first to invent an algorithm that used an amount
 of store bounded by a multiple of the number of characters in the pattern, to
@@ -38,8 +47,16 @@ got quite big anyway to handle all the Perl stuff.

 The compiled form of a pattern is a vector of bytes, containing items of
 variable length. The first byte in an item is an opcode, and the length of the
-item is either implicit in the opcode or contained in the data bytes which
-follow it. A list of all the opcodes follows:
+item is either implicit in the opcode or contained in the data bytes that
+follow it. 
+
+In many cases below "two-byte" data values are specified. This is in fact just
+a default. PCRE can be compiled to use 3-byte or 4-byte values (impairing the
+performance). This is necessary only when patterns whose compiled length is
+greater than 64K are going to be processed. In this description, we assume the 
+"normal" compilation options.
+
+A list of all the opcodes follows:

 Opcodes with no following data
 ------------------------------
@@ -48,7 +65,7 @@ These items are all just one byte long

  OP_END                 end of pattern
  OP_ANY                 match any character
-  OP_ANYBYTE             match any single byte, even in UTF-8 mode 
+  OP_ANYBYTE             match any single byte, even in UTF-8 mode
  OP_SOD                 match start of data: \A
   OP_SOM                 start of match (subject + offset): \G
  OP_CIRC                ^ (start of data, or after \n in multiline)
@@ -63,13 +80,14 @@ These items are all just one byte long
  OP_EODN                match end of data or \n at end: \Z
  OP_EOD                 match end of data: \z
  OP_DOLL                $ (end of data, or before \n in multiline)
-
+  OP_EXTUNI              match an extended Unicode character 
+  

 Repeating single characters
 ---------------------------

-The common repeats (*, +, ?) when applied to a single character appear as
-two-byte items using the following opcodes:
+The common repeats (*, +, ?) when applied to a single character use the
+following opcodes:

  OP_STAR
  OP_MINSTAR
@@ -78,6 +96,7 @@ two-byte items using the following opcodes:
  OP_QUERY
  OP_MINQUERY

+In ASCII mode, these are two-byte items; in UTF-8 mode, the length is variable.
 Those with "MIN" in their name are the minimizing versions. Each is followed by
 the character that is to be repeated. Other repeats make use of

@@ -109,39 +128,52 @@ byte. The opcodes are:
  OP_TYPEEXACT


-Matching a character string
+Match by Unicode property
+-------------------------
+
+OP_PROP and OP_NOTPROP are used for positive and negative matches of a 
+character by testing its Unicode property (the \p and \P escape sequences).
+Each is followed by a single byte that encodes the desired property value.
+
+Repeats of these items use the OP_TYPESTAR etc. set of opcodes, followed by two 
+bytes: OP_PROP or OP_NOTPROP and then the desired property value.
+
+
+Matching literal characters
 ---------------------------

-The OP_CHARS opcode is followed by a one-byte count and then that number of
-characters. If there are more than 255 characters in sequence, successive
-instances of OP_CHARS are used.
+The OP_CHAR opcode is followed by a single character that is to be matched 
+casefully. For caseless matching, OP_CHARNC is used. In UTF-8 mode, the 
+character may be more than one byte long. (Earlier versions of PCRE used 
+multi-character strings, but this was changed to allow some new features to be 
+added.)


 Character classes
 -----------------

-If there is only one character, OP_CHARS is used for a positive class,
-and OP_NOT for a negative one (that is, for something like [^a]). However, in 
-UTF-8 mode, this applies only to characters with values < 128, because OP_NOT 
-is confined to single bytes.
+If there is only one character, OP_CHAR or OP_CHARNC is used for a positive
+class, and OP_NOT for a negative one (that is, for something like [^a]).
+However, in UTF-8 mode, the use of OP_NOT applies only to characters with
+values < 128, because OP_NOT is confined to single bytes.

 Another set of repeating opcodes (OP_NOTSTAR etc.) are used for a repeated,
 negated, single-character class. The normal ones (OP_STAR etc.) are used for a
 repeated positive single-character class.

 When there's more than one character in a class and all the characters are less
-than 256, OP_CLASS is used for a positive class, and OP_NCLASS for a negative 
+than 256, OP_CLASS is used for a positive class, and OP_NCLASS for a negative
 one. In either case, the opcode is followed by a 32-byte bit map containing a 1
 bit for every character that is acceptable. The bits are counted from the least
 significant end of each byte.

-The reason for having both OP_CLASS and OP_NCLASS is so that, in UTF-8 mode, 
-subject characters with values greater than 256 can be handled correctly. For 
+The reason for having both OP_CLASS and OP_NCLASS is so that, in UTF-8 mode,
+subject characters with values greater than 255 can be handled correctly. For
 OP_CLASS they don't match, whereas for OP_NCLASS they do.

 For classes containing characters with values > 255, OP_XCLASS is used. It
 optionally uses a bit map (if any characters lie within it), followed by a list
-of pairs and single characters. There is a flag character than indicates 
+of pairs and single characters. There is a flag character that indicates
 whether it's a positive or a negative class.


@@ -192,14 +224,14 @@ the bracket itself. (They could have all been done like this, but I was making
 minimal changes.)

 A bracket opcode is followed by two bytes which give the offset to the next
-alternative OP_ALT or, if there aren't any branches, to the matching KET
+alternative OP_ALT or, if there aren't any branches, to the matching OP_KET
 opcode. Each OP_ALT is followed by two bytes giving the offset to the next one,
-or to the KET opcode.
+or to the OP_KET opcode.

 OP_KET is used for subpatterns that do not repeat indefinitely, while
 OP_KETRMIN and OP_KETRMAX are used for indefinite repetitions, minimally or
 maximally respectively. All three are followed by two bytes giving (as a
-positive number) the offset back to the matching BRA opcode.
+positive number) the offset back to the matching OP_BRA opcode.

 If a subpattern is quantified such that it is permitted to match zero times, it
 is preceded by one of OP_BRAZERO or OP_BRAMINZERO. These are single-byte
@@ -207,15 +239,14 @@ opcodes which tell the matcher that skipping this subpattern entirely is a
 valid branch.

 A subpattern with an indefinite maximum repetition is replicated in the
-compiled data its minimum number of times (or once with a BRAZERO if the
-minimum is zero), with the final copy terminating with a KETRMIN or KETRMAX as
-appropriate.
+compiled data its minimum number of times (or once with OP_BRAZERO if the
+minimum is zero), with the final copy terminating with OP_KETRMIN or OP_KETRMAX
+as appropriate.

 A subpattern with a bounded maximum repetition is replicated in a nested
-fashion up to the maximum number of times, with BRAZERO or BRAMINZERO before
-each replication after the minimum, so that, for example, (abc){2,5} is
-compiled as (abc)(abc)((abc)((abc)(abc)?)?)?. The 99 and 200 bracket limits do
-not apply to these internally generated brackets.
+fashion up to the maximum number of times, with OP_BRAZERO or OP_BRAMINZERO
+before each replication after the minimum, so that, for example, (abc){2,5} is
+compiled as (abc)(abc)((abc)((abc)(abc)?)?)?.


 Assertions
@@ -260,8 +291,11 @@ from the start of the whole pattern.
 Callout
 -------

-OP_CALLOUT is followed by one byte of data that holds a callout number in the 
-range 0 to 255.
+OP_CALLOUT is followed by one byte of data that holds a callout number in the
+range 0 to 254 for manual callouts, or 255 for an automatic callout. In both 
+cases there follows a two-byte value giving the offset in the pattern to the
+start of the following item, and another two-byte item giving the length of the
+next item.


 Changing options
@@ -278,4 +312,4 @@ at compile time, and so does not cause anything to be put into the compiled
 data.

 Philip Hazel
-August 2003
+September 2004
@@ -9,32 +9,40 @@ the file Tech.Notes for some information on the internals.

 Written by: Philip Hazel <ph10@cam.ac.uk>

-           Copyright (c) 1997-2004 University of Cambridge
+           Copyright (c) 1997-2003 University of Cambridge

 -----------------------------------------------------------------------------
-Permission is granted to anyone to use this software for any purpose on any
-computer system, and to redistribute it freely, subject to the following
-restrictions:
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions are met:

-1. This software is distributed in the hope that it will be useful,
-   but WITHOUT ANY WARRANTY; without even the implied warranty of
-   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+    * Redistributions of source code must retain the above copyright notice,
+      this list of conditions and the following disclaimer.

-2. The origin of this software must not be misrepresented, either by
-   explicit claim or by omission.
+    * Redistributions in binary form must reproduce the above copyright
+      notice, this list of conditions and the following disclaimer in the
+      documentation and/or other materials provided with the distribution.

-3. Altered versions must be plainly marked as such, and must not be
-   misrepresented as being the original software.
+    * Neither the name of the University of Cambridge nor the names of its
+      contributors may be used to endorse or promote products derived from
+      this software without specific prior written permission.

-4. If PCRE is embedded in any software that is released under the GNU
-   General Purpose Licence (GPL), then the terms of that licence shall
-   supersede any condition above with which it is incompatible.
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
+LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+POSSIBILITY OF SUCH DAMAGE.
 -----------------------------------------------------------------------------
 */

 /* This module contains some convenience functions for extracting substrings
 from the subject string after a regex match has succeeded. The original idea
-for these functions came from Scott Wimer <scottw@cgibuilder.com>. */
+for these functions came from Scott Wimer. */


 /* Include the internals header, which itself includes Standard C headers plus
@@ -5,30 +5,38 @@

 /* This is a library of functions to support regular expressions whose syntax
 and semantics are as close as possible to those of the Perl 5 language. See
-the file Tech.Notes for some information on the internals.
+the file doc/Tech.Notes for some information on the internals.

 Written by: Philip Hazel <ph10@cam.ac.uk>

-           Copyright (c) 1997-2003 University of Cambridge
+           Copyright (c) 1997-2004 University of Cambridge

 -----------------------------------------------------------------------------
-Permission is granted to anyone to use this software for any purpose on any
-computer system, and to redistribute it freely, subject to the following
-restrictions:
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions are met:

-1. This software is distributed in the hope that it will be useful,
-   but WITHOUT ANY WARRANTY; without even the implied warranty of
-   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+    * Redistributions of source code must retain the above copyright notice,
+      this list of conditions and the following disclaimer.

-2. The origin of this software must not be misrepresented, either by
-   explicit claim or by omission.
+    * Redistributions in binary form must reproduce the above copyright
+      notice, this list of conditions and the following disclaimer in the
+      documentation and/or other materials provided with the distribution.

-3. Altered versions must be plainly marked as such, and must not be
-   misrepresented as being the original software.
+    * Neither the name of the University of Cambridge nor the names of its
+      contributors may be used to endorse or promote products derived from
+      this software without specific prior written permission.

-4. If PCRE is embedded in any software that is released under the GNU
-   General Purpose Licence (GPL), then the terms of that licence shall
-   supersede any condition above with which it is incompatible.
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
+LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+POSSIBILITY OF SUCH DAMAGE.
 -----------------------------------------------------------------------------
 */

@@ -45,6 +53,18 @@ modules, but which are not relevant to the outside. */
 # include <php_config.h>
 #endif

+/* Standard C headers plus the external interface definition. The only time
+setjmp and stdarg are used is when NO_RECURSE is set. */
+
+#include <ctype.h>
+#include <limits.h>
+#include <setjmp.h>
+#include <stdarg.h>
+#include <stddef.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+
 #ifndef PCRE_SPY
 #define PCRE_DEFINITION       /* Win32 __declspec(export) trigger for .dll */
 #endif
@@ -57,24 +77,45 @@ On Unix systems, "configure" can be used to override this default. */
 #define NEWLINE '\n'
 #endif

-/* The value of MATCH_LIMIT determines the default number of times the match()
-function can be called during a single execution of pcre_exec(). (There is a
-runtime method of setting a different limit.) The limit exists in order to
-catch runaway regular expressions that take for ever to determine that they do
-not match. The default is set very large so that it does not accidentally catch
-legitimate cases. On Unix systems, "configure" can be used to override this
-default default. */
-
-#ifndef MATCH_LIMIT
-#define MATCH_LIMIT 10000000
-#endif
-
 /* If you are compiling for a system that needs some magic to be inserted
 * before the definition of an exported function, define this macro to contain
 * the relevant magic. It appears at the start of every exported function. */
                                                                                                                                
 #define EXPORT

+/* We need to have types that specify unsigned 16-bit and 32-bit integers. We
+cannot determine these outside the compilation (e.g. by running a program as
+part of "configure") because PCRE is often cross-compiled for use on other
+systems. Instead we make use of the maximum sizes that are available at
+preprocessor time in standard C environments. */
+
+#if USHRT_MAX == 65535
+  typedef unsigned short pcre_uint16;
+#elif UINT_MAX == 65535
+  typedef unsigned int pcre_uint16;
+#else
+  #error Cannot determine a type for 16-bit unsigned integers
+#endif
+
+#if UINT_MAX == 4294967295
+  typedef unsigned int pcre_uint32;
+#elif ULONG_MAX == 4294967295
+  typedef unsigned long int pcre_uint32;
+#else
+  #error Cannot determine a type for 32-bit unsigned integers
+#endif
+
+/* All character handling must be done as unsigned characters. Otherwise there
+are problems with top-bit-set characters and functions such as isspace().
+However, we leave the interface to the outside world as char *, because that
+should make things easier for callers. We define a short type for unsigned char
+to save lots of typing. I tried "uchar", but it causes problems on Digital
+Unix, where it is defined in sys/types, so use "uschar" instead. */
+
+typedef unsigned char uschar;
+
+/* Include the public PCRE header */
+
 #include "pcre.h"

 /* When compiling for use with the Virtual Pascal compiler, these functions
@@ -95,18 +136,6 @@ neither (there some non-Unix environments where this is the case). This assumes
 that all calls to memmove are moving strings upwards in store, which is the
 case in PCRE. */

-/* Standard C headers plus the external interface definition. The only time
-setjmp and stdarg are used is when NO_RECURSE is set. */
-
-#include <ctype.h>
-#include <limits.h>
-#include <setjmp.h>
-#include <stdarg.h>
-#include <stddef.h>
-#include <stdio.h>
-#include <stdlib.h>
-#include <string.h>
-
 #if ! HAVE_MEMMOVE
 #undef  memmove        /* some systems may have a macro */
 #if HAVE_BCOPY
@@ -126,13 +155,14 @@ for (i = 0; i < n; ++i) *(--dest) =  *(--src);
 #endif   /* not VPCOMPAT */


-/* PCRE keeps offsets in its compiled code as 2-byte quantities by default.
-These are used, for example, to link from the start of a subpattern to its
-alternatives and its end. The use of 2 bytes per offset limits the size of the
-compiled regex to around 64K, which is big enough for almost everybody.
-However, I received a request for an even bigger limit. For this reason, and
-also to make the code easier to maintain, the storing and loading of offsets
-from the byte string is now handled by the macros that are defined here.
+/* PCRE keeps offsets in its compiled code as 2-byte quantities (always stored
+in big-endian order) by default. These are used, for example, to link from the
+start of a subpattern to its alternatives and its end. The use of 2 bytes per
+offset limits the size of the compiled regex to around 64K, which is big enough
+for almost everybody. However, I received a request for an even bigger limit.
+For this reason, and also to make the code easier to maintain, the storing and
+loading of offsets from the byte string is now handled by the macros that are
+defined here.

 The macros are controlled by the value of LINK_SIZE. This defaults to 2 in
 the config.h file, but can be overridden by using -D on the command line. This
@@ -208,6 +238,7 @@ Standard C system should have one. */
 #define offsetof(p_type,field) ((size_t)&(((p_type *)0)->field))
 #endif

+
 /* These are the public options that can change during matching. */

 #define PCRE_IMS (PCRE_CASELESS|PCRE_MULTILINE|PCRE_DOTALL)
@@ -216,12 +247,13 @@ Standard C system should have one. */
 but skip the top bit so we can use ints for convenience without getting tangled
 with negative values. The public options defined in pcre.h start at the least
 significant end. Make sure they don't overlap, though now that we have expanded
-to four bytes there is plenty of space. */
+to four bytes, there is plenty of space. */

 #define PCRE_FIRSTSET      0x40000000  /* first_byte is set */
 #define PCRE_REQCHSET      0x20000000  /* req_byte is set */
 #define PCRE_STARTLINE     0x10000000  /* start after \n for multiline */
 #define PCRE_ICHANGED      0x08000000  /* i option changes within regex */
+#define PCRE_NOPARTIAL     0x04000000  /* can't use partial with this regex */

 /* Options for the "extra" block produced by pcre_study(). */

@@ -233,10 +265,11 @@ time, run time or study time, respectively. */
 #define PUBLIC_OPTIONS \
  (PCRE_CASELESS|PCRE_EXTENDED|PCRE_ANCHORED|PCRE_MULTILINE| \
   PCRE_DOTALL|PCRE_DOLLAR_ENDONLY|PCRE_EXTRA|PCRE_UNGREEDY|PCRE_UTF8| \
-   PCRE_NO_AUTO_CAPTURE|PCRE_NO_UTF8_CHECK)
+   PCRE_NO_AUTO_CAPTURE|PCRE_NO_UTF8_CHECK|PCRE_AUTO_CALLOUT)

 #define PUBLIC_EXEC_OPTIONS \
-  (PCRE_ANCHORED|PCRE_NOTBOL|PCRE_NOTEOL|PCRE_NOTEMPTY|PCRE_NO_UTF8_CHECK)
+  (PCRE_ANCHORED|PCRE_NOTBOL|PCRE_NOTEOL|PCRE_NOTEMPTY|PCRE_NO_UTF8_CHECK| \
+   PCRE_PARTIAL)

 #define PUBLIC_STUDY_OPTIONS 0   /* None defined */

@@ -296,12 +329,13 @@ definitions below, up to ESC_z. There's a dummy for OP_ANY because it
 corresponds to "." rather than an escape sequence. The final one must be
 ESC_REF as subsequent values are used for \1, \2, \3, etc. There are two
 tests in the code for an escape greater than ESC_b and less than ESC_Z to
-detect the types that may be repeated. These are the types that consume a
-character. If any new escapes are put in between that don't consume a
+detect the types that may be repeated. These are the types that consume
+characters. If any new escapes are put in between that don't consume a
 character, that code will have to change. */

 enum { ESC_A = 1, ESC_G, ESC_B, ESC_b, ESC_D, ESC_d, ESC_S, ESC_s, ESC_W,
-       ESC_w, ESC_dum1, ESC_C, ESC_Z, ESC_z, ESC_E, ESC_Q, ESC_REF };
+       ESC_w, ESC_dum1, ESC_C, ESC_P, ESC_p, ESC_X, ESC_Z, ESC_z, ESC_E,
+       ESC_Q, ESC_REF };

 /* Flag bits and data types for the extended class (OP_XCLASS) for classes that
 contain UTF-8 characters with values greater than 255. */
@@ -312,6 +346,8 @@ contain UTF-8 characters with values greater than 255. */
 #define XCL_END       0    /* Marks end of individual items */
 #define XCL_SINGLE    1    /* Single item (one multibyte char) follows */
 #define XCL_RANGE     2    /* A range (two multibyte chars) follows */
+#define XCL_PROP      3    /* Unicode property (one property code) follows */
+#define XCL_NOTPROP   4    /* Unicode inverted property (ditto) */


 /* Opcode table: OP_BRA must be last, as all values >= it are used for brackets
@@ -337,100 +373,112 @@ enum {
  OP_WORDCHAR,           /* 10 \w */
  OP_ANY,            /* 11 Match any character */
  OP_ANYBYTE,        /* 12 Match any byte (\C); different to OP_ANY for UTF-8 */
-  OP_EODN,           /* 13 End of data or \n at end of data: \Z. */
-  OP_EOD,            /* 14 End of data: \z */
+  OP_NOTPROP,        /* 13 \P (not Unicode property) */
+  OP_PROP,           /* 14 \p (Unicode property) */
+  OP_EXTUNI,         /* 15 \X (extended Unicode sequence) */
+  OP_EODN,           /* 16 End of data or \n at end of data: \Z. */
+  OP_EOD,            /* 17 End of data: \z */

-  OP_OPT,            /* 15 Set runtime options */
-  OP_CIRC,           /* 16 Start of line - varies with multiline switch */
-  OP_DOLL,           /* 17 End of line - varies with multiline switch */
-  OP_CHARS,          /* 18 Match string of characters */
-  OP_NOT,            /* 19 Match anything but the following char */
+  OP_OPT,            /* 18 Set runtime options */
+  OP_CIRC,           /* 19 Start of line - varies with multiline switch */
+  OP_DOLL,           /* 20 End of line - varies with multiline switch */
+  OP_CHAR,           /* 21 Match one character, casefully */
+  OP_CHARNC,         /* 22 Match one character, caselessly */
+  OP_NOT,            /* 23 Match anything but the following char */

-  OP_STAR,           /* 20 The maximizing and minimizing versions of */
-  OP_MINSTAR,        /* 21 all these opcodes must come in pairs, with */
-  OP_PLUS,           /* 22 the minimizing one second. */
-  OP_MINPLUS,        /* 23 This first set applies to single characters */
-  OP_QUERY,          /* 24 */
-  OP_MINQUERY,       /* 25 */
-  OP_UPTO,           /* 26 From 0 to n matches */
-  OP_MINUPTO,        /* 27 */
-  OP_EXACT,          /* 28 Exactly n matches */
+  OP_STAR,           /* 24 The maximizing and minimizing versions of */
+  OP_MINSTAR,        /* 25 all these opcodes must come in pairs, with */
+  OP_PLUS,           /* 26 the minimizing one second. */
+  OP_MINPLUS,        /* 27 This first set applies to single characters */
+  OP_QUERY,          /* 28 */
+  OP_MINQUERY,       /* 29 */
+  OP_UPTO,           /* 30 From 0 to n matches */
+  OP_MINUPTO,        /* 31 */
+  OP_EXACT,          /* 32 Exactly n matches */

-  OP_NOTSTAR,        /* 29 The maximizing and minimizing versions of */
-  OP_NOTMINSTAR,     /* 30 all these opcodes must come in pairs, with */
-  OP_NOTPLUS,        /* 31 the minimizing one second. */
-  OP_NOTMINPLUS,     /* 32 This set applies to "not" single characters */
-  OP_NOTQUERY,       /* 33 */
-  OP_NOTMINQUERY,    /* 34 */
-  OP_NOTUPTO,        /* 35 From 0 to n matches */
-  OP_NOTMINUPTO,     /* 36 */
-  OP_NOTEXACT,       /* 37 Exactly n matches */
+  OP_NOTSTAR,        /* 33 The maximizing and minimizing versions of */
+  OP_NOTMINSTAR,     /* 34 all these opcodes must come in pairs, with */
+  OP_NOTPLUS,        /* 35 the minimizing one second. */
+  OP_NOTMINPLUS,     /* 36 This set applies to "not" single characters */
+  OP_NOTQUERY,       /* 37 */
+  OP_NOTMINQUERY,    /* 38 */
+  OP_NOTUPTO,        /* 39 From 0 to n matches */
+  OP_NOTMINUPTO,     /* 40 */
+  OP_NOTEXACT,       /* 41 Exactly n matches */

-  OP_TYPESTAR,       /* 38 The maximizing and minimizing versions of */
-  OP_TYPEMINSTAR,    /* 39 all these opcodes must come in pairs, with */
-  OP_TYPEPLUS,       /* 40 the minimizing one second. These codes must */
-  OP_TYPEMINPLUS,    /* 41 be in exactly the same order as those above. */
-  OP_TYPEQUERY,      /* 42 This set applies to character types such as \d */
-  OP_TYPEMINQUERY,   /* 43 */
-  OP_TYPEUPTO,       /* 44 From 0 to n matches */
-  OP_TYPEMINUPTO,    /* 45 */
-  OP_TYPEEXACT,      /* 46 Exactly n matches */
+  OP_TYPESTAR,       /* 42 The maximizing and minimizing versions of */
+  OP_TYPEMINSTAR,    /* 43 all these opcodes must come in pairs, with */
+  OP_TYPEPLUS,       /* 44 the minimizing one second. These codes must */
+  OP_TYPEMINPLUS,    /* 45 be in exactly the same order as those above. */
+  OP_TYPEQUERY,      /* 46 This set applies to character types such as \d */
+  OP_TYPEMINQUERY,   /* 47 */
+  OP_TYPEUPTO,       /* 48 From 0 to n matches */
+  OP_TYPEMINUPTO,    /* 49 */
+  OP_TYPEEXACT,      /* 50 Exactly n matches */

-  OP_CRSTAR,         /* 47 The maximizing and minimizing versions of */
-  OP_CRMINSTAR,      /* 48 all these opcodes must come in pairs, with */
-  OP_CRPLUS,         /* 49 the minimizing one second. These codes must */
-  OP_CRMINPLUS,      /* 50 be in exactly the same order as those above. */
-  OP_CRQUERY,        /* 51 These are for character classes and back refs */
-  OP_CRMINQUERY,     /* 52 */
-  OP_CRRANGE,        /* 53 These are different to the three seta above. */
-  OP_CRMINRANGE,     /* 54 */
+  OP_CRSTAR,         /* 51 The maximizing and minimizing versions of */
+  OP_CRMINSTAR,      /* 52 all these opcodes must come in pairs, with */
+  OP_CRPLUS,         /* 53 the minimizing one second. These codes must */
+  OP_CRMINPLUS,      /* 54 be in exactly the same order as those above. */
+  OP_CRQUERY,        /* 55 These are for character classes and back refs */
+  OP_CRMINQUERY,     /* 56 */
+  OP_CRRANGE,        /* 57 These are different to the three sets above. */
+  OP_CRMINRANGE,     /* 58 */

-  OP_CLASS,          /* 55 Match a character class, chars < 256 only */
-  OP_NCLASS,         /* 56 Same, but the bitmap was created from a negative
+  OP_CLASS,          /* 59 Match a character class, chars < 256 only */
+  OP_NCLASS,         /* 60 Same, but the bitmap was created from a negative
                           class - the difference is relevant only when a UTF-8
                           character > 255 is encountered. */

-  OP_XCLASS,         /* 57 Extended class for handling UTF-8 chars within the
+  OP_XCLASS,         /* 61 Extended class for handling UTF-8 chars within the
                           class. This does both positive and negative. */

-  OP_REF,            /* 58 Match a back reference */
-  OP_RECURSE,        /* 59 Match a numbered subpattern (possibly recursive) */
-  OP_CALLOUT,        /* 60 Call out to external function if provided */
+  OP_REF,            /* 62 Match a back reference */
+  OP_RECURSE,        /* 63 Match a numbered subpattern (possibly recursive) */
+  OP_CALLOUT,        /* 64 Call out to external function if provided */

-  OP_ALT,            /* 61 Start of alternation */
-  OP_KET,            /* 62 End of group that doesn't have an unbounded repeat */
-  OP_KETRMAX,        /* 63 These two must remain together and in this */
-  OP_KETRMIN,        /* 64 order. They are for groups the repeat for ever. */
+  OP_ALT,            /* 65 Start of alternation */
+  OP_KET,            /* 66 End of group that doesn't have an unbounded repeat */
+  OP_KETRMAX,        /* 67 These two must remain together and in this */
+  OP_KETRMIN,        /* 68 order. They are for groups that repeat for ever. */

  /* The assertions must come before ONCE and COND */

-  OP_ASSERT,         /* 65 Positive lookahead */
-  OP_ASSERT_NOT,     /* 66 Negative lookahead */
-  OP_ASSERTBACK,     /* 67 Positive lookbehind */
-  OP_ASSERTBACK_NOT, /* 68 Negative lookbehind */
-  OP_REVERSE,        /* 69 Move pointer back - used in lookbehind assertions */
+  OP_ASSERT,         /* 69 Positive lookahead */
+  OP_ASSERT_NOT,     /* 70 Negative lookahead */
+  OP_ASSERTBACK,     /* 71 Positive lookbehind */
+  OP_ASSERTBACK_NOT, /* 72 Negative lookbehind */
+  OP_REVERSE,        /* 73 Move pointer back - used in lookbehind assertions */

  /* ONCE and COND must come after the assertions, with ONCE first, as there's
  a test for >= ONCE for a subpattern that isn't an assertion. */

-  OP_ONCE,           /* 70 Once matched, don't back up into the subpattern */
-  OP_COND,           /* 71 Conditional group */
-  OP_CREF,           /* 72 Used to hold an extraction string number (cond ref) */
+  OP_ONCE,           /* 74 Once matched, don't back up into the subpattern */
+  OP_COND,           /* 75 Conditional group */
+  OP_CREF,           /* 76 Used to hold an extraction string number (cond ref) */

-  OP_BRAZERO,        /* 73 These two must remain together and in this */
-  OP_BRAMINZERO,     /* 74 order. */
+  OP_BRAZERO,        /* 77 These two must remain together and in this */
+  OP_BRAMINZERO,     /* 78 order. */

-  OP_BRANUMBER,      /* 75 Used for extracting brackets whose number is greater
+  OP_BRANUMBER,      /* 79 Used for extracting brackets whose number is greater
                           than can fit into an opcode. */

-  OP_BRA             /* 76 This and greater values are used for brackets that
-                           extract substrings up to a basic limit. After that,
-                           use is made of OP_BRANUMBER. */
+  OP_BRA             /* 80 This and greater values are used for brackets that
+                           extract substrings up to EXTRACT_BASIC_MAX. After
+                           that, use is made of OP_BRANUMBER. */
 };

-/* WARNING: There is an implicit assumption in study.c that all opcodes are
-less than 128 in value. This makes handling UTF-8 character sequences easier.
-*/
+/* WARNING WARNING WARNING: There is an implicit assumption in pcre.c and
+study.c that all opcodes are less than 128 in value. This makes handling UTF-8
+character sequences easier. */
+
+/* The highest extraction number before we have to start using additional
+bytes. (Originally PCRE didn't have support for extraction counts higher than
+this number.) The value is limited by the number of opcodes left after OP_BRA,
+i.e. 255 - OP_BRA. We actually set it a bit lower to leave room for additional
+opcodes. */
+
+#define EXTRACT_BASIC_MAX  100


 /* This macro defines textual names for all the opcodes. There are used only
@@ -439,8 +487,10 @@ macro is referenced only in printint.c. */

 #define OP_NAME_LIST \
  "End", "\\A", "\\G", "\\B", "\\b", "\\D", "\\d",                \
-  "\\S", "\\s", "\\W", "\\w", "Any", "Anybyte", "\\Z", "\\z",     \
-  "Opt", "^", "$", "chars", "not",                                \
+  "\\S", "\\s", "\\W", "\\w", "Any", "Anybyte",                   \
+  "notprop", "prop", "extuni",                                    \
+  "\\Z", "\\z",                                                   \
+  "Opt", "^", "$", "char", "charnc", "not",                       \
  "*", "*?", "+", "+?", "?", "??", "{", "{", "{",                 \
  "*", "*?", "+", "+?", "?", "??", "{", "{", "{",                 \
  "*", "*?", "+", "+?", "?", "??", "{", "{", "{",                 \
@@ -463,8 +513,11 @@ in UTF-8 mode. The code that uses this table must know about such things. */
 #define OP_LENGTHS \
  1,                             /* End                                    */ \
  1, 1, 1, 1, 1, 1, 1, 1, 1, 1,  /* \A, \G, \B, \B, \D, \d, \S, \s, \W, \w */ \
-  1, 1, 1, 1, 2, 1, 1,           /* Any, Anybyte, \Z, \z, Opt, ^, $        */ \
-  2,                             /* Chars - the minimum length             */ \
+  1, 1,                          /* Any, Anybyte                           */ \
+  2, 2, 1,                       /* NOTPROP, PROP, EXTUNI                  */ \
+  1, 1, 2, 1, 1,                 /* \Z, \z, Opt, ^, $                      */ \
+  2,                             /* Char  - the minimum length             */ \
+  2,                             /* Charnc  - the minimum length           */ \
  2,                             /* not                                    */ \
  /* Positive single-char repeats                            ** These are  */ \
  2, 2, 2, 2, 2, 2,              /* *, *?, +, +?, ?, ??      ** minima in  */ \
@@ -483,7 +536,7 @@ in UTF-8 mode. The code that uses this table must know about such things. */
  0,                             /* XCLASS - variable length               */ \
  3,                             /* REF                                    */ \
  1+LINK_SIZE,                   /* RECURSE                                */ \
-  2,                             /* CALLOUT                                */ \
+  2+2*LINK_SIZE,                 /* CALLOUT                                */ \
  1+LINK_SIZE,                   /* Alt                                    */ \
  1+LINK_SIZE,                   /* Ket                                    */ \
  1+LINK_SIZE,                   /* KetRmax                                */ \
@@ -501,14 +554,6 @@ in UTF-8 mode. The code that uses this table must know about such things. */
  1+LINK_SIZE                    /* BRA                                    */ \


-/* The highest extraction number before we have to start using additional
-bytes. (Originally PCRE didn't have support for extraction counts highter than
-this number.) The value is limited by the number of opcodes left after OP_BRA,
-i.e. 255 - OP_BRA. We actually set it a bit lower to leave room for additional
-opcodes. */
-
-#define EXTRACT_BASIC_MAX  150
-
 /* A magic value for OP_CREF to indicate the "in recursion" condition. */

 #define CREF_RECURSE  0xffff
@@ -554,7 +599,7 @@ just to accommodate the POSIX wrapper. */
 #define ERR34 "character value in \\x{...} sequence is too large"
 #define ERR35 "invalid condition (?(0)"
 #define ERR36 "\\C not allowed in lookbehind assertion"
-#define ERR37 "PCRE does not support \\L, \\l, \\N, \\P, \\p, \\U, \\u, or \\X"
+#define ERR37 "PCRE does not support \\L, \\l, \\N, \\U, or \\u"
 #define ERR38 "number after (?C is > 255"
 #define ERR39 "closing ) for (?C expected"
 #define ERR40 "recursive call could loop indefinitely"
@@ -562,37 +607,51 @@ just to accommodate the POSIX wrapper. */
 #define ERR42 "syntax error after (?P"
 #define ERR43 "two named groups have the same name"
 #define ERR44 "invalid UTF-8 string"
-
-/* All character handling must be done as unsigned characters. Otherwise there
-are problems with top-bit-set characters and functions such as isspace().
-However, we leave the interface to the outside world as char *, because that
-should make things easier for callers. We define a short type for unsigned char
-to save lots of typing. I tried "uchar", but it causes problems on Digital
-Unix, where it is defined in sys/types, so use "uschar" instead. */
-
-typedef unsigned char uschar;
+#define ERR45 "support for \\P, \\p, and \\X has not been compiled"
+#define ERR46 "malformed \\P or \\p sequence"
+#define ERR47 "unknown property name after \\P or \\p"

 /* The real format of the start of the pcre block; the index of names and the
-code vector run on as long as necessary after the end. */
+code vector run on as long as necessary after the end. We store an explicit
+offset to the name table so that if a regex is compiled on one host, saved, and
+then run on another where the size of pointers is different, all might still
+be well. For the case of compiled-on-4 and run-on-8, we include an extra
+pointer that is always NULL. For future-proofing, we also include a few dummy
+fields - even though you can never get this planning right!
+
+NOTE NOTE NOTE:
+Because people can now save and re-use compiled patterns, any additions to this
+structure should be made at the end, and something earlier (e.g. a new
+flag in the options or one of the dummy fields) should indicate that the new
+fields are present. Currently PCRE always sets the dummy fields to zero.
+NOTE NOTE NOTE:
+*/

 typedef struct real_pcre {
-  unsigned long int magic_number;
-  size_t size;                        /* Total that was malloced */
-  const unsigned char *tables;        /* Pointer to tables */
-  unsigned long int options;
-  unsigned short int top_bracket;
-  unsigned short int top_backref;
-  unsigned short int first_byte;
-  unsigned short int req_byte;
-  unsigned short int name_entry_size; /* Size of any name items; 0 => none */
-  unsigned short int name_count;      /* Number of name items */
+  pcre_uint32 magic_number;
+  pcre_uint32 size;               /* Total that was malloced */
+  pcre_uint32 options;
+  pcre_uint32 dummy1;             /* For future use, maybe */
+
+  pcre_uint16 top_bracket;
+  pcre_uint16 top_backref;
+  pcre_uint16 first_byte;
+  pcre_uint16 req_byte;
+  pcre_uint16 name_table_offset;  /* Offset to name table that follows */
+  pcre_uint16 name_entry_size;    /* Size of any name items */
+  pcre_uint16 name_count;         /* Number of name items */
+  pcre_uint16 dummy2;             /* For future use, maybe */
+
+  const unsigned char *tables;    /* Pointer to tables or NULL for std */
+  const unsigned char *nullpad;   /* NULL padding */
 } real_pcre;

-/* The format of the block used to store data from pcre_study(). */
+/* The format of the block used to store data from pcre_study(). The same
+remark (see NOTE above) about extending this structure applies. */

 typedef struct pcre_study_data {
-  size_t size;                        /* Total that was malloced */
-  uschar options;
+  pcre_uint32 size;               /* Total that was malloced */
+  pcre_uint32 options;
  uschar start_bits[32];
 } pcre_study_data;

@@ -605,12 +664,14 @@ typedef struct compile_data {
  const uschar *cbits;          /* Points to character type table */
  const uschar *ctypes;         /* Points to table of type maps */
  const uschar *start_code;     /* The start of the compiled code */
+  const uschar *start_pattern;  /* The start of the pattern */
  uschar *name_table;           /* The name/number table */
  int  names_found;             /* Number of entries so far */
  int  name_entry_size;         /* Size of each entry */
  int  top_backref;             /* Maximum back reference */
  unsigned int backref_map;     /* Bitmap of low back refs */
  int  req_varyopt;             /* "After variable item" flag for reqbyte */
+  BOOL nopartial;               /* Set TRUE if partial won't work */
 } compile_data;

 /* Structure for maintaining a chain of pointers to the currently incomplete
@@ -660,6 +721,8 @@ typedef struct match_data {
  BOOL   utf8;                  /* UTF8 flag */
  BOOL   endonly;               /* Dollar not before final \n */
  BOOL   notempty;              /* Empty string match not wanted */
+  BOOL   partial;               /* PARTIAL flag */
+  BOOL   hitend;                /* Hit the end of the subject at some point */
  const uschar *start_code;     /* For use when recursing */
  const uschar *start_subject;  /* Start of the subject string */
  const uschar *end_subject;    /* End of the subject string */
@@ -0,0 +1,19 @@
+LIBRARY libpcre
+EXPORTS
+pcre_malloc
+pcre_free
+pcre_config
+pcre_callout
+pcre_compile
+pcre_copy_substring
+pcre_exec
+pcre_get_substring
+pcre_get_stringnumber
+pcre_get_substring_list
+pcre_free_substring
+pcre_free_substring_list
+pcre_info
+pcre_fullinfo
+pcre_maketables
+pcre_study
+pcre_version
@@ -0,0 +1,24 @@
+LIBRARY libpcreposix
+EXPORTS
+pcre_malloc
+pcre_free
+pcre_config
+pcre_callout
+pcre_compile
+pcre_copy_substring
+pcre_exec
+pcre_get_substring
+pcre_get_stringnumber
+pcre_get_substring_list
+pcre_free_substring
+pcre_free_substring_list
+pcre_info
+pcre_fullinfo
+pcre_maketables
+pcre_study
+pcre_version
+
+regcomp
+regexec
+regerror
+regfree
@@ -8,29 +8,35 @@ and semantics are as close as possible to those of the Perl 5 language.

 Written by: Philip Hazel <ph10@cam.ac.uk>

-           Copyright (c) 1997-2004 University of Cambridge
+           Copyright (c) 1997-2003 University of Cambridge

 -----------------------------------------------------------------------------
-Permission is granted to anyone to use this software for any purpose on any
-computer system, and to redistribute it freely, subject to the following
-restrictions:
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions are met:

-1. This software is distributed in the hope that it will be useful,
-   but WITHOUT ANY WARRANTY; without even the implied warranty of
-   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+    * Redistributions of source code must retain the above copyright notice,
+      this list of conditions and the following disclaimer.

-2. The origin of this software must not be misrepresented, either by
-   explicit claim or by omission.
+    * Redistributions in binary form must reproduce the above copyright
+      notice, this list of conditions and the following disclaimer in the
+      documentation and/or other materials provided with the distribution.

-3. Altered versions must be plainly marked as such, and must not be
-   misrepresented as being the original software.
+    * Neither the name of the University of Cambridge nor the names of its
+      contributors may be used to endorse or promote products derived from
+      this software without specific prior written permission.

-4. If PCRE is embedded in any software that is released under the GNU
-   General Purpose Licence (GPL), then the terms of that licence shall
-   supersede any condition above with which it is incompatible.
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
+LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+POSSIBILITY OF SUCH DAMAGE.
 -----------------------------------------------------------------------------
-
-See the file Tech.Notes for some information on the internals.
 */


@@ -2,7 +2,39 @@
 *       Perl-Compatible Regular Expressions      *
 *************************************************/

-/* Copyright (c) 1997-2003 University of Cambridge */
+/* In its original form, this is the .in file that is transformed by
+"configure" into pcre.h.
+
+           Copyright (c) 1997-2004 University of Cambridge
+
+-----------------------------------------------------------------------------
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions are met:
+
+    * Redistributions of source code must retain the above copyright notice,
+      this list of conditions and the following disclaimer.
+
+    * Redistributions in binary form must reproduce the above copyright
+      notice, this list of conditions and the following disclaimer in the
+      documentation and/or other materials provided with the distribution.
+
+    * Neither the name of the University of Cambridge nor the names of its
+      contributors may be used to endorse or promote products derived from
+      this software without specific prior written permission.
+
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
+LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+POSSIBILITY OF SUCH DAMAGE.
+-----------------------------------------------------------------------------
+*/

 #ifndef _PCRE_H
 #define _PCRE_H
@@ -12,9 +44,9 @@ make changes to pcre.in. */

 #include "php_compat.h"

-#define PCRE_MAJOR          4
-#define PCRE_MINOR          5
-#define PCRE_DATE           01-December-2003
+#define PCRE_MAJOR          5
+#define PCRE_MINOR          0
+#define PCRE_DATE           13-Sep-2004

 /* Win32 uses DLL by default */

@@ -60,6 +92,8 @@ extern "C" {
 #define PCRE_UTF8               0x0800
 #define PCRE_NO_AUTO_CAPTURE    0x1000
 #define PCRE_NO_UTF8_CHECK      0x2000
+#define PCRE_AUTO_CALLOUT       0x4000
+#define PCRE_PARTIAL            0x8000

 /* Exec-time and get/set-time error codes */

@@ -74,6 +108,10 @@ extern "C" {
 #define PCRE_ERROR_CALLOUT         (-9)  /* Never used by PCRE itself */
 #define PCRE_ERROR_BADUTF8        (-10)
 #define PCRE_ERROR_BADUTF8_OFFSET (-11)
+#define PCRE_ERROR_PARTIAL        (-12)
+#define PCRE_ERROR_BADPARTIAL     (-13)
+#define PCRE_ERROR_INTERNAL       (-14)
+#define PCRE_ERROR_BADCOUNT       (-15)

 /* Request types for pcre_fullinfo() */

@@ -89,6 +127,7 @@ extern "C" {
 #define PCRE_INFO_NAMECOUNT          8
 #define PCRE_INFO_NAMETABLE          9
 #define PCRE_INFO_STUDYSIZE         10
+#define PCRE_INFO_DEFAULT_TABLES    11

 /* Request types for pcre_config() */

@@ -98,12 +137,14 @@ extern "C" {
 #define PCRE_CONFIG_POSIX_MALLOC_THRESHOLD  3
 #define PCRE_CONFIG_MATCH_LIMIT             4
 #define PCRE_CONFIG_STACKRECURSE            5
+#define PCRE_CONFIG_UNICODE_PROPERTIES      6

 /* Bit flags for the pcre_extra structure */

 #define PCRE_EXTRA_STUDY_DATA          0x0001
 #define PCRE_EXTRA_MATCH_LIMIT         0x0002
 #define PCRE_EXTRA_CALLOUT_DATA        0x0004
+#define PCRE_EXTRA_TABLES              0x0008

 /* Types */

@@ -111,13 +152,15 @@ struct real_pcre;                 /* declaration; the definition is private  */
 typedef struct real_pcre pcre;

 /* The structure for passing additional data to pcre_exec(). This is defined in
-such as way as to be extensible. */
+such a way as to be extensible. Always add new fields at the end, in order to
+remain compatible. */

 typedef struct pcre_extra {
  unsigned long int flags;        /* Bits for which fields are set */
  void *study_data;               /* Opaque data from pcre_study() */
  unsigned long int match_limit;  /* Maximum number of calls to match() */
  void *callout_data;             /* Data passed back in callouts */
+  const unsigned char *tables;    /* Pointer to character tables */
 } pcre_extra;

 /* The structure for passing out data via the pcre_callout_function. We use a
@@ -133,10 +176,13 @@ typedef struct pcre_callout_block {
  const char  *subject;           /* The subject being matched */
  int          subject_length;    /* The length of the subject */
  int          start_match;       /* Offset to start of this match attempt */
-  int          current_position;  /* Where we currently are */
+  int          current_position;  /* Where we currently are in the subject */
  int          capture_top;       /* Max current capture */
  int          capture_last;      /* Most recently closed capture */
  void        *callout_data;      /* Data passed in with the call */
+  /* ------------------- Added for Version 1 -------------------------- */
+  int          pattern_position;  /* Offset to next item in the pattern */
+  int          next_item_length;  /* Length of next item in the pattern */
  /* ------------------------------------------------------------------ */
 } pcre_callout_block;

@@ -0,0 +1,324 @@
+/*************************************************
+*           PCRE DEMONSTRATION PROGRAM           *
+*************************************************/
+
+/* This is a demonstration program to illustrate the most straightforward ways
+of calling the PCRE regular expression library from a C program. See the
+pcresample documentation for a short discussion.
+
+Compile thuswise:
+  gcc -Wall pcredemo.c -I/usr/local/include -L/usr/local/lib \
+    -R/usr/local/lib -lpcre
+
+Replace "/usr/local/include" and "/usr/local/lib" with wherever the include and
+library files for PCRE are installed on your system. Only some operating
+systems (e.g. Solaris) use the -R option.
+*/
+
+
+#include <stdio.h>
+#include <string.h>
+#include <pcre.h>
+
+#define OVECCOUNT 30    /* should be a multiple of 3 */
+
+
+/* Entry point of the demonstration program.
+
+Usage: pcredemo [-g] <pattern> <subject>
+
+The pattern is compiled and matched once against the subject; the numbered
+and named captured substrings of that match are printed. If -g is given, the
+search is then repeated from the end of each match to find all occurrences.
+
+Arguments:
+  argc        the number of command line arguments
+  argv        the command line arguments
+
+Returns:      0 after a successful first match (and, with -g, once the
+                repeated search has run to completion)
+              1 for a usage error, a compile failure, no first match, or
+                any other matching error
+
+The compiled pattern is always released with pcre_free(), which is declared in
+pcre.h and is the counterpart of the allocator used by pcre_compile(). Calling
+free() directly would need <stdlib.h> and would be wrong if the application
+had replaced PCRE's memory functions. */
+
+int main(int argc, char **argv)
+{
+pcre *re;
+const char *error;
+char *pattern;
+char *subject;
+unsigned char *name_table;
+int erroffset;
+int find_all;
+int namecount;
+int name_entry_size;
+int ovector[OVECCOUNT];
+int subject_length;
+int rc, i;
+
+
+/**************************************************************************
+* First, sort out the command line. There is only one possible option at  *
+* the moment, "-g" to request repeated matching to find all occurrences,  *
+* like Perl's /g option. We set the variable find_all to a non-zero value *
+* if the -g option is present. Apart from that, there must be exactly two *
+* arguments.                                                              *
+**************************************************************************/
+
+find_all = 0;
+for (i = 1; i < argc; i++)
+  {
+  if (strcmp(argv[i], "-g") == 0) find_all = 1;
+    else break;
+  }
+
+/* After the options, we require exactly two arguments, which are the pattern,
+and the subject string. */
+
+if (argc - i != 2)
+  {
+  printf("Two arguments required: a regex and a subject string\n");
+  return 1;
+  }
+
+pattern = argv[i];
+subject = argv[i+1];
+subject_length = (int)strlen(subject);
+
+
+/*************************************************************************
+* Now we are going to compile the regular expression pattern, and handle *
+* any errors that are detected.                                          *
+*************************************************************************/
+
+re = pcre_compile(
+  pattern,              /* the pattern */
+  0,                    /* default options */
+  &error,               /* for error message */
+  &erroffset,           /* for error offset */
+  NULL);                /* use default character tables */
+
+/* Compilation failed: print the error message and exit */
+
+if (re == NULL)
+  {
+  printf("PCRE compilation failed at offset %d: %s\n", erroffset, error);
+  return 1;
+  }
+
+
+/*************************************************************************
+* If the compilation succeeded, we call PCRE again, in order to do a     *
+* pattern match against the subject string. This does just ONE match. If *
+* further matching is needed, it will be done below.                     *
+*************************************************************************/
+
+rc = pcre_exec(
+  re,                   /* the compiled pattern */
+  NULL,                 /* no extra data - we didn't study the pattern */
+  subject,              /* the subject string */
+  subject_length,       /* the length of the subject */
+  0,                    /* start at offset 0 in the subject */
+  0,                    /* default options */
+  ovector,              /* output vector for substring information */
+  OVECCOUNT);           /* number of elements in the output vector */
+
+/* Matching failed: handle error cases */
+
+if (rc < 0)
+  {
+  switch(rc)
+    {
+    case PCRE_ERROR_NOMATCH: printf("No match\n"); break;
+    /*
+    Handle other special cases if you like
+    */
+    default: printf("Matching error %d\n", rc); break;
+    }
+  pcre_free(re);     /* Release memory used for the compiled pattern */
+  return 1;
+  }
+
+/* Match succeeded */
+
+printf("\nMatch succeeded at offset %d\n", ovector[0]);
+
+
+/*************************************************************************
+* We have found the first match within the subject string. If the output *
+* vector wasn't big enough, set its size to the maximum. Then output any *
+* substrings that were captured.                                         *
+*************************************************************************/
+
+/* The output vector wasn't big enough */
+
+if (rc == 0)
+  {
+  rc = OVECCOUNT/3;
+  printf("ovector only has room for %d captured substrings\n", rc - 1);
+  }
+
+/* Show substrings stored in the output vector by number. Obviously, in a real
+application you might want to do things other than print them. */
+
+for (i = 0; i < rc; i++)
+  {
+  char *substring_start = subject + ovector[2*i];
+  int substring_length = ovector[2*i+1] - ovector[2*i];
+  printf("%2d: %.*s\n", i, substring_length, substring_start);
+  }
+
+
+/**************************************************************************
+* That concludes the basic part of this demonstration program. We have    *
+* compiled a pattern, and performed a single match. The code that follows *
+* first shows how to access named substrings, and then how to code for    *
+* repeated matches on the same subject.                                   *
+**************************************************************************/
+
+/* See if there are any named substrings, and if so, show them by name. First
+we have to extract the count of named parentheses from the pattern. */
+
+(void)pcre_fullinfo(
+  re,                   /* the compiled pattern */
+  NULL,                 /* no extra data - we didn't study the pattern */
+  PCRE_INFO_NAMECOUNT,  /* number of named substrings */
+  &namecount);          /* where to put the answer */
+
+if (namecount <= 0) printf("No named substrings\n"); else
+  {
+  unsigned char *tabptr;
+  printf("Named substrings\n");
+
+  /* Before we can access the substrings, we must extract the table for
+  translating names to numbers, and the size of each entry in the table. */
+
+  (void)pcre_fullinfo(
+    re,                       /* the compiled pattern */
+    NULL,                     /* no extra data - we didn't study the pattern */
+    PCRE_INFO_NAMETABLE,      /* address of the table */
+    &name_table);             /* where to put the answer */
+
+  (void)pcre_fullinfo(
+    re,                       /* the compiled pattern */
+    NULL,                     /* no extra data - we didn't study the pattern */
+    PCRE_INFO_NAMEENTRYSIZE,  /* size of each entry in the table */
+    &name_entry_size);        /* where to put the answer */
+
+  /* Now we can scan the table and, for each entry, print the number, the name,
+  and the substring itself. */
+
+  tabptr = name_table;
+  for (i = 0; i < namecount; i++)
+    {
+    int n = (tabptr[0] << 8) | tabptr[1];
+    printf("(%d) %*s: %.*s\n", n, name_entry_size - 3, tabptr + 2,
+      ovector[2*n+1] - ovector[2*n], subject + ovector[2*n]);
+    tabptr += name_entry_size;
+    }
+  }
+
+
+/*************************************************************************
+* If the "-g" option was given on the command line, we want to continue  *
+* to search for additional matches in the subject string, in a similar   *
+* way to the /g option in Perl. This turns out to be trickier than you   *
+* might think because of the possibility of matching an empty string.    *
+* What happens is as follows:                                            *
+*                                                                        *
+* If the previous match was NOT for an empty string, we can just start   *
+* the next match at the end of the previous one.                         *
+*                                                                        *
+* If the previous match WAS for an empty string, we can't do that, as it *
+* would lead to an infinite loop. Instead, a special call of pcre_exec() *
+* is made with the PCRE_NOTEMPTY and PCRE_ANCHORED flags set. The first  *
+* of these tells PCRE that an empty string is not a valid match; other   *
+* possibilities must be tried. The second flag restricts PCRE to one     *
+* match attempt at the initial string position. If this match succeeds,  *
+* an alternative to the empty string match has been found, and we can    *
+* proceed round the loop.                                                *
+*************************************************************************/
+
+if (!find_all)
+  {
+  pcre_free(re);   /* Release the memory used for the compiled pattern */
+  return 0;        /* Finish unless -g was given */
+  }
+
+/* Loop for second and subsequent matches */
+
+for (;;)
+  {
+  int options = 0;                 /* Normally no options */
+  int start_offset = ovector[1];   /* Start at end of previous match */
+
+  /* If the previous match was for an empty string, we are finished if we are
+  at the end of the subject. Otherwise, arrange to run another match at the
+  same point to see if a non-empty match can be found. */
+
+  if (ovector[0] == ovector[1])
+    {
+    if (ovector[0] == subject_length) break;
+    options = PCRE_NOTEMPTY | PCRE_ANCHORED;
+    }
+
+  /* Run the next matching operation */
+
+  rc = pcre_exec(
+    re,                   /* the compiled pattern */
+    NULL,                 /* no extra data - we didn't study the pattern */
+    subject,              /* the subject string */
+    subject_length,       /* the length of the subject */
+    start_offset,         /* starting offset in the subject */
+    options,              /* options */
+    ovector,              /* output vector for substring information */
+    OVECCOUNT);           /* number of elements in the output vector */
+
+  /* This time, a result of NOMATCH isn't an error. If the value in "options"
+  is zero, it just means we have found all possible matches, so the loop ends.
+  Otherwise, it means we have failed to find a non-empty-string match at a
+  point where there was a previous empty-string match. In this case, we do what
+  Perl does: advance the matching position by one, and continue. We do this by
+  setting the "end of previous match" offset, because that is picked up at the
+  top of the loop as the point at which to start again. */
+
+  if (rc == PCRE_ERROR_NOMATCH)
+    {
+    if (options == 0) break;
+    ovector[1] = start_offset + 1;
+    continue;    /* Go round the loop again */
+    }
+
+  /* Other matching errors are not recoverable. */
+
+  if (rc < 0)
+    {
+    printf("Matching error %d\n", rc);
+    pcre_free(re);    /* Release memory used for the compiled pattern */
+    return 1;
+    }
+
+  /* Match succeeded */
+
+  printf("\nMatch succeeded again at offset %d\n", ovector[0]);
+
+  /* The match succeeded, but the output vector wasn't big enough. */
+
+  if (rc == 0)
+    {
+    rc = OVECCOUNT/3;
+    printf("ovector only has room for %d captured substrings\n", rc - 1);
+    }
+
+  /* As before, show substrings stored in the output vector by number, and then
+  also any named substrings. */
+
+  for (i = 0; i < rc; i++)
+    {
+    char *substring_start = subject + ovector[2*i];
+    int substring_length = ovector[2*i+1] - ovector[2*i];
+    printf("%2d: %.*s\n", i, substring_length, substring_start);
+    }
+
+  if (namecount <= 0) printf("No named substrings\n"); else
+    {
+    unsigned char *tabptr = name_table;
+    printf("Named substrings\n");
+    for (i = 0; i < namecount; i++)
+      {
+      int n = (tabptr[0] << 8) | tabptr[1];
+      printf("(%d) %*s: %.*s\n", n, name_entry_size - 3, tabptr + 2,
+        ovector[2*n+1] - ovector[2*n], subject + ovector[2*n]);
+      tabptr += name_entry_size;
+      }
+    }
+  }      /* End of loop to find second and subsequent matches */
+
+printf("\n");
+pcre_free(re);       /* Release memory used for the compiled pattern */
+return 0;
+}
+
+/* End of pcredemo.c */
@@ -4,7 +4,38 @@

 /* This is a grep program that uses the PCRE regular expression library to do
 its pattern matching. On a Unix or Win32 system it can recurse into
-directories. */
+directories.
+
+           Copyright (c) 1997-2004 University of Cambridge
+
+-----------------------------------------------------------------------------
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions are met:
+
+    * Redistributions of source code must retain the above copyright notice,
+      this list of conditions and the following disclaimer.
+
+    * Redistributions in binary form must reproduce the above copyright
+      notice, this list of conditions and the following disclaimer in the
+      documentation and/or other materials provided with the distribution.
+
+    * Neither the name of the University of Cambridge nor the names of its
+      contributors may be used to endorse or promote products derived from
+      this software without specific prior written permission.
+
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
+LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+POSSIBILITY OF SUCH DAMAGE.
+-----------------------------------------------------------------------------
+*/

 #include <ctype.h>
 #include <stdio.h>
@@ -15,23 +15,31 @@ Written by: Philip Hazel <ph10@cam.ac.uk>
           Copyright (c) 1997-2004 University of Cambridge

 -----------------------------------------------------------------------------
-Permission is granted to anyone to use this software for any purpose on any
-computer system, and to redistribute it freely, subject to the following
-restrictions:
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions are met:

-1. This software is distributed in the hope that it will be useful,
-   but WITHOUT ANY WARRANTY; without even the implied warranty of
-   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+    * Redistributions of source code must retain the above copyright notice,
+      this list of conditions and the following disclaimer.

-2. The origin of this software must not be misrepresented, either by
-   explicit claim or by omission.
+    * Redistributions in binary form must reproduce the above copyright
+      notice, this list of conditions and the following disclaimer in the
+      documentation and/or other materials provided with the distribution.

-3. Altered versions must be plainly marked as such, and must not be
-   misrepresented as being the original software.
+    * Neither the name of the University of Cambridge nor the names of its
+      contributors may be used to endorse or promote products derived from
+      this software without specific prior written permission.

-4. If PCRE is embedded in any software that is released under the GNU
-   General Purpose Licence (GPL), then the terms of that licence shall
-   supersede any condition above with which it is incompatible.
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
+LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+POSSIBILITY OF SUCH DAMAGE.
 -----------------------------------------------------------------------------
 */

@@ -48,7 +56,7 @@ static const char *const estring[] = {
  ERR11, ERR12, ERR13, ERR14, ERR15, ERR16, ERR17, ERR18, ERR19, ERR20,
  ERR21, ERR22, ERR23, ERR24, ERR25, ERR26, ERR27, ERR29, ERR29, ERR30,
  ERR31, ERR32, ERR33, ERR34, ERR35, ERR36, ERR37, ERR38, ERR39, ERR40,
-  ERR41, ERR42, ERR43, ERR44 };
+  ERR41, ERR42, ERR43, ERR44, ERR45, ERR46, ERR47 };

 static const int eint[] = {
  REG_EESCAPE, /* "\\ at end of pattern" */
@@ -87,14 +95,17 @@ static const int eint[] = {
  REG_BADPAT,  /* "character value in \x{...} sequence is too large" */
  REG_BADPAT,  /* "invalid condition (?(0)" */
  REG_BADPAT,  /* "\\C not allowed in lookbehind assertion" */
-  REG_EESCAPE, /* "PCRE does not support \\L, \\l, \\N, \\P, \\p, \\U, \\u, or \\X" */
+  REG_EESCAPE, /* "PCRE does not support \\L, \\l, \\N, \\U, or \\u" */
  REG_BADPAT,  /* "number after (?C is > 255" */
  REG_BADPAT,  /* "closing ) for (?C expected" */
  REG_BADPAT,  /* "recursive call could loop indefinitely" */
  REG_BADPAT,  /* "unrecognized character after (?P" */
  REG_BADPAT,  /* "syntax error after (?P" */
  REG_BADPAT,  /* "two named groups have the same name" */
-  REG_BADPAT   /* "invalid UTF-8 string" */
+  REG_BADPAT,  /* "invalid UTF-8 string" */
+  REG_BADPAT,  /* "support for \\P, \\p, and \\X has not been compiled" */
+  REG_BADPAT,  /* "malformed \\P or \\p sequence" */
+  REG_BADPAT   /* "unknown property name after \\P or \\p" */
 };

 /* Table of texts corresponding to POSIX error codes */
@@ -2,14 +2,43 @@
 *       Perl-Compatible Regular Expressions      *
 *************************************************/

-/* Copyright (c) 1997-2003 University of Cambridge */
-
 #ifndef _PCREPOSIX_H
 #define _PCREPOSIX_H

 /* This is the header for the POSIX wrapper interface to the PCRE Perl-
 Compatible Regular Expression library. It defines the things POSIX says should
-be there. I hope. */
+be there. I hope.
+
+            Copyright (c) 1997-2004 University of Cambridge
+
+-----------------------------------------------------------------------------
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions are met:
+
+    * Redistributions of source code must retain the above copyright notice,
+      this list of conditions and the following disclaimer.
+
+    * Redistributions in binary form must reproduce the above copyright
+      notice, this list of conditions and the following disclaimer in the
+      documentation and/or other materials provided with the distribution.
+
+    * Neither the name of the University of Cambridge nor the names of its
+      contributors may be used to endorse or promote products derived from
+      this software without specific prior written permission.
+
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
+LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+POSSIBILITY OF SUCH DAMAGE.
+-----------------------------------------------------------------------------
+*/

 /* Have to include stdlib.h in order to ensure that size_t is defined. */

@@ -4,7 +4,37 @@

 /* This program was hacked up as a tester for PCRE. I really should have
 written it more tidily in the first place. Will I ever learn? It has grown and
-been extended and consequently is now rather untidy in places. */
+been extended and consequently is now rather untidy in places.
+
+-----------------------------------------------------------------------------
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions are met:
+
+    * Redistributions of source code must retain the above copyright notice,
+      this list of conditions and the following disclaimer.
+
+    * Redistributions in binary form must reproduce the above copyright
+      notice, this list of conditions and the following disclaimer in the
+      documentation and/or other materials provided with the distribution.
+
+    * Neither the name of the University of Cambridge nor the names of its
+      contributors may be used to endorse or promote products derived from
+      this software without specific prior written permission.
+
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
+LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+POSSIBILITY OF SUCH DAMAGE.
+-----------------------------------------------------------------------------
+*/
+

 #include <ctype.h>
 #include <stdio.h>
@@ -12,6 +42,7 @@ been extended and consequently is now rather untidy in places. */
 #include <stdlib.h>
 #include <time.h>
 #include <locale.h>
+#include <errno.h>

 /* We need the internal info for displaying the results of pcre_study(). Also
 for getting the opcodes for showing compiled code. */
@@ -35,9 +66,10 @@ Makefile. */
 #endif
 #endif

-#define LOOPREPEAT 50000
+#define LOOPREPEAT 500000

 #define BUFFER_SIZE 30000
+#define PBUFFER_SIZE BUFFER_SIZE
 #define DBUFFER_SIZE BUFFER_SIZE


@@ -52,6 +84,8 @@ static int show_malloc;
 static int use_utf8;
 static size_t gotten_store;

+static uschar *pbuffer = NULL;
+

 static const int utf8_table1[] = {
  0x0000007f, 0x000007ff, 0x0000ffff, 0x001fffff, 0x03ffffff, 0x7fffffff};
@@ -71,10 +105,13 @@ static const int utf8_table3[] = {
 /* The code for doing this is held in a separate file that is also included in
 pcre.c when it is compiled with the debug switch. It defines a function called
 print_internals(), which uses a table of opcode lengths defined by the macro
-OP_LENGTHS, whose name must be OP_lengths. */
+OP_LENGTHS, whose name must be OP_lengths. It also uses a table that translates
+Unicode property names to numbers; this is kept in a separate file. */

 static uschar OP_lengths[] = { OP_LENGTHS };

+#include "ucp.h"
+#include "ucptypetable.c"
 #include "printint.c"


@@ -269,7 +306,7 @@ data is not zero. */
 static int callout(pcre_callout_block *cb)
 {
 FILE *f = (first_callout | callout_extra)? outfile : NULL;
-int i, pre_start, post_start;
+int i, pre_start, post_start, subject_length;

 if (callout_extra)
  {
@@ -300,16 +337,26 @@ pre_start = pchars((unsigned char *)cb->subject, cb->start_match, f);
 post_start = pchars((unsigned char *)(cb->subject + cb->start_match),
  cb->current_position - cb->start_match, f);

+subject_length = pchars((unsigned char *)cb->subject, cb->subject_length, NULL);
+
 (void)pchars((unsigned char *)(cb->subject + cb->current_position),
  cb->subject_length - cb->current_position, f);

 if (f != NULL) fprintf(f, "\n");

 /* Always print appropriate indicators, with callout number if not already
-shown */
+shown. For automatic callouts, show the pattern offset. */

-if (callout_extra) fprintf(outfile, "    ");
-  else fprintf(outfile, "%3d ", cb->callout_number);
+if (cb->callout_number == 255)
+  {
+  fprintf(outfile, "%+3d ", cb->pattern_position);
+  if (cb->pattern_position > 99) fprintf(outfile, "\n    ");
+  }
+else
+  {
+  if (callout_extra) fprintf(outfile, "    ");
+    else fprintf(outfile, "%3d ", cb->callout_number);
+  }

 for (i = 0; i < pre_start; i++) fprintf(outfile, " ");
 fprintf(outfile, "^");
@@ -320,6 +367,12 @@ if (post_start > 0)
  fprintf(outfile, "^");
  }

+for (i = 0; i < subject_length - pre_start - post_start + 4; i++)
+  fprintf(outfile, " ");
+
+fprintf(outfile, "%.*s", (cb->next_item_length == 0)? 1 : cb->next_item_length,
+  pbuffer + cb->pattern_position);
+
 fprintf(outfile, "\n");
 first_callout = 0;

@@ -395,6 +448,23 @@ if ((rc = pcre_fullinfo(re, study, option, ptr)) < 0)



+/*************************************************
+*         Byte flipping function                 *
+*************************************************/
+
+/* Reverse the byte order of a 2-byte or 4-byte quantity held in a long int.
+
+Arguments:
+  value       the value to flip; only its low-order 2 or 4 bytes are used
+  n           2 to swap the two low-order bytes; any other value selects
+                the 4-byte reversal
+
+Returns:      the value with the selected low-order bytes reversed
+
+The masking and shifting are done in unsigned arithmetic. With a signed long
+of 32 bits, moving a byte of 0x80 or more into the top position would overflow
+the signed type, which is undefined behaviour in C. */
+
+static long int
+byteflip(long int value, int n)
+{
+unsigned long int v = (unsigned long int)value;
+if (n == 2)
+  return (long int)(((v & 0x00fful) << 8) | ((v & 0xff00ul) >> 8));
+return (long int)(((v & 0x000000fful) << 24) |
+                  ((v & 0x0000ff00ul) <<  8) |
+                  ((v & 0x00ff0000ul) >>  8) |
+                  ((v & 0xff000000ul) >> 24));
+}
+
+
+
+
 /*************************************************
 *                Main Program                    *
 *************************************************/
@@ -429,8 +499,15 @@ when I am debugging. */

 buffer = (unsigned char *)malloc(BUFFER_SIZE);
 dbuffer = (unsigned char *)malloc(DBUFFER_SIZE);
+pbuffer = (unsigned char *)malloc(PBUFFER_SIZE);

-/* Static so that new_malloc can use it. */
+/* The outfile variable is static so that new_malloc can use it. The _setmode()
+stuff is some magic that I don't understand, but which apparently does good
+things in Windows. It's related to line terminations.  */
+
+#if defined(_WIN32) || defined(WIN32)
+_setmode( _fileno( stdout ), 0x8000 );
+#endif  /* defined(_WIN32) || defined(WIN32) */

 outfile = stdout;

@@ -462,6 +539,8 @@ while (argc > 1 && argv[op][0] == '-')
    printf("Compiled with\n");
    (void)pcre_config(PCRE_CONFIG_UTF8, &rc);
    printf("  %sUTF-8 support\n", rc? "" : "No ");
+    (void)pcre_config(PCRE_CONFIG_UNICODE_PROPERTIES, &rc);
+    printf("  %sUnicode properties support\n", rc? "" : "No ");
    (void)pcre_config(PCRE_CONFIG_NEWLINE, &rc);
    printf("  Newline character is %s\n", (rc == '\r')? "CR" : "LF");
    (void)pcre_config(PCRE_CONFIG_LINK_SIZE, &rc);
@@ -481,11 +560,12 @@ while (argc > 1 && argv[op][0] == '-')
    printf("  -C     show PCRE compile-time options and exit\n");
    printf("  -d     debug: show compiled code; implies -i\n"
           "  -i     show information about compiled pattern\n"
+           "  -m     output memory used information\n"
           "  -o <n> set size of offsets vector to <n>\n");
 #if !defined NOPOSIX
    printf("  -p     use POSIX interface\n");
 #endif
-    printf("  -s     output store information\n"
+    printf("  -s     output store (memory) used information\n"
           "  -t     time compilation and execution\n");
    return 1;
    }
@@ -508,7 +588,7 @@ if (offsets == NULL)

 if (argc > 1)
  {
-  infile = fopen(argv[op], "r");
+  infile = fopen(argv[op], "rb");
  if (infile == NULL)
    {
    printf("** Failed to open %s\n", argv[op]);
@@ -518,7 +598,7 @@ if (argc > 1)

 if (argc > 2)
  {
-  outfile = fopen(argv[op+1], "w");
+  outfile = fopen(argv[op+1], "wb");
  if (outfile == NULL)
    {
    printf("** Failed to open %s\n", argv[op+1]);
@@ -551,13 +631,17 @@ while (!done)

  const char *error;
  unsigned char *p, *pp, *ppp;
+  unsigned char *to_file = NULL;
  const unsigned char *tables = NULL;
+  unsigned long int true_size, true_study_size = 0;
+  size_t size, regex_gotten_store;
  int do_study = 0;
  int do_debug = debug;
  int do_G = 0;
  int do_g = 0;
  int do_showinfo = showinfo;
  int do_showrest = 0;
+  int do_flip = 0;
  int erroroffset, len, delimiter;

  use_utf8 = 0;
@@ -571,8 +655,93 @@ while (!done)
  while (isspace(*p)) p++;
  if (*p == 0) continue;

-  /* Get the delimiter and seek the end of the pattern; if is isn't
-  complete, read more. */
+  /* See if the pattern is to be loaded pre-compiled from a file. */
+
+  if (*p == '<' && strchr((char *)(p+1), '<') == NULL)
+    {
+    unsigned long int magic;
+    uschar sbuf[8];
+    FILE *f;
+
+    p++;
+    pp = p + (int)strlen((char *)p);
+    while (isspace(pp[-1])) pp--;
+    *pp = 0;
+
+    f = fopen((char *)p, "rb");
+    if (f == NULL)
+      {
+      fprintf(outfile, "Failed to open %s: %s\n", p, strerror(errno));
+      continue;
+      }
+
+    if (fread(sbuf, 1, 8, f) != 8) goto FAIL_READ;
+
+    true_size =
+      (sbuf[0] << 24) | (sbuf[1] << 16) | (sbuf[2] << 8) | sbuf[3];
+    true_study_size =
+      (sbuf[4] << 24) | (sbuf[5] << 16) | (sbuf[6] << 8) | sbuf[7];
+
+    re = (real_pcre *)new_malloc(true_size);
+    regex_gotten_store = gotten_store;
+
+    if (fread(re, 1, true_size, f) != true_size) goto FAIL_READ;
+
+    magic = ((real_pcre *)re)->magic_number;
+    if (magic != MAGIC_NUMBER)
+      {
+      if (byteflip(magic, sizeof(magic)) == MAGIC_NUMBER)
+        {
+        do_flip = 1;
+        }
+      else
+        {
+        fprintf(outfile, "Data in %s is not a compiled PCRE regex\n", p);
+        fclose(f);
+        continue;
+        }
+      }
+
+    fprintf(outfile, "Compiled regex%s loaded from %s\n",
+      do_flip? " (byte-inverted)" : "", p);
+
+    /* Need to know if UTF-8 for printing data strings */
+
+    new_info(re, NULL, PCRE_INFO_OPTIONS, &options);
+    use_utf8 = (options & PCRE_UTF8) != 0;
+
+    /* Now see if there is any following study data */
+
+    if (true_study_size != 0)
+      {
+      pcre_study_data *psd;
+
+      extra = (pcre_extra *)new_malloc(sizeof(pcre_extra) + true_study_size);
+      extra->flags = PCRE_EXTRA_STUDY_DATA;
+
+      psd = (pcre_study_data *)(((char *)extra) + sizeof(pcre_extra));
+      extra->study_data = psd;
+
+      if (fread(psd, 1, true_study_size, f) != true_study_size)
+        {
+        FAIL_READ:
+        fprintf(outfile, "Failed to read data from %s\n", p);
+        if (extra != NULL) new_free(extra);
+        if (re != NULL) new_free(re);
+        fclose(f);
+        continue;
+        }
+      fprintf(outfile, "Study data loaded from %s\n", p);
+      do_study = 1;     /* To get the data output if requested */
+      }
+    else fprintf(outfile, "No study data\n");
+
+    fclose(f);
+    goto SHOW_INFO;
+    }
+
+  /* In-line pattern (the usual case). Get the delimiter and seek the end of
+  the pattern; if it isn't complete, read more. */

  delimiter = *p++;

@@ -617,9 +786,11 @@ while (!done)

  if (pp[1] == '\\') *pp++ = '\\';

-  /* Terminate the pattern at the delimiter */
+  /* Terminate the pattern at the delimiter, and save a copy of the pattern
+  for callouts. */

  *pp++ = 0;
+  strcpy((char *)pbuffer, (char *)p);

  /* Look for options after final delimiter */

@@ -639,8 +810,10 @@ while (!done)

      case '+': do_showrest = 1; break;
      case 'A': options |= PCRE_ANCHORED; break;
+      case 'C': options |= PCRE_AUTO_CALLOUT; break;
      case 'D': do_debug = do_showinfo = 1; break;
      case 'E': options |= PCRE_DOLLAR_ENDONLY; break;
+      case 'F': do_flip = 1; break;
      case 'G': do_G = 1; break;
      case 'I': do_showinfo = 1; break;
      case 'M': log_store = 1; break;
@@ -669,7 +842,15 @@ while (!done)
      pp = ppp;
      break;

+      case '>':
+      to_file = pp;
+      while (*pp != 0) pp++;
+      while (isspace(pp[-1])) pp--;
+      *pp = 0;
+      break;
+
      case '\n': case ' ': break;
+
      default:
      fprintf(outfile, "** Unknown option '%c'\n", pp[-1]);
      goto SKIP_DATA;
@@ -685,6 +866,7 @@ while (!done)
    {
    int rc;
    int cflags = 0;
+
    if ((options & PCRE_CASELESS) != 0) cflags |= REG_ICASE;
    if ((options & PCRE_MULTILINE) != 0) cflags |= REG_NEWLINE;
    rc = regcomp(&preg, (char *)p, cflags);
@@ -759,14 +941,77 @@ while (!done)
              sizeof(real_pcre) -
              ((real_pcre *)re)->name_count * ((real_pcre *)re)->name_entry_size));

+    /* Extract the size for possible writing before possibly flipping it,
+    and remember the store that was got. */
+
+    true_size = ((real_pcre *)re)->size;
+    regex_gotten_store = gotten_store;
+
+    /* If /S was present, study the regexp to generate additional info to
+    help with the matching. */
+
+    if (do_study)
+      {
+      if (timeit)
+        {
+        register int i;
+        clock_t time_taken;
+        clock_t start_time = clock();
+        for (i = 0; i < LOOPREPEAT; i++)
+          extra = pcre_study(re, study_options, &error);
+        time_taken = clock() - start_time;
+        if (extra != NULL) free(extra);
+        fprintf(outfile, "  Study time %.3f milliseconds\n",
+          (((double)time_taken * 1000.0) / (double)LOOPREPEAT) /
+            (double)CLOCKS_PER_SEC);
+        }
+      extra = pcre_study(re, study_options, &error);
+      if (error != NULL)
+        fprintf(outfile, "Failed to study: %s\n", error);
+      else if (extra != NULL)
+        true_study_size = ((pcre_study_data *)(extra->study_data))->size;
+      }
+
+    /* If the 'F' option was present, we flip the bytes of all the integer
+    fields in the regex data block and the study block. This is to make it
+    possible to test PCRE's handling of byte-flipped patterns, e.g. those
+    compiled on a different architecture. */
+
+    if (do_flip)
+      {
+      real_pcre *rre = (real_pcre *)re;
+      rre->magic_number = byteflip(rre->magic_number, sizeof(rre->magic_number));
+      rre->size = byteflip(rre->size, sizeof(rre->size));
+      rre->options = byteflip(rre->options, sizeof(rre->options));
+      rre->top_bracket = byteflip(rre->top_bracket, sizeof(rre->top_bracket));
+      rre->top_backref = byteflip(rre->top_backref, sizeof(rre->top_backref));
+      rre->first_byte = byteflip(rre->first_byte, sizeof(rre->first_byte));
+      rre->req_byte = byteflip(rre->req_byte, sizeof(rre->req_byte));
+      rre->name_table_offset = byteflip(rre->name_table_offset,
+        sizeof(rre->name_table_offset));
+      rre->name_entry_size = byteflip(rre->name_entry_size,
+        sizeof(rre->name_entry_size));
+      rre->name_count = byteflip(rre->name_count, sizeof(rre->name_count));
+
+      if (extra != NULL)
+        {
+        pcre_study_data *rsd = (pcre_study_data *)(extra->study_data);
+        rsd->size = byteflip(rsd->size, sizeof(rsd->size));
+        rsd->options = byteflip(rsd->options, sizeof(rsd->options));
+        }
+      }
+
+    /* Extract information from the compiled data if required */
+
+    SHOW_INFO:
+
    if (do_showinfo)
      {
-      unsigned long int get_options;
+      unsigned long int get_options, all_options;
      int old_first_char, old_options, old_count;
      int count, backrefmax, first_char, need_char;
      int nameentrysize, namecount;
      const uschar *nametable;
-      size_t size;

      if (do_debug)
        {
@@ -802,9 +1047,9 @@ while (!done)
            get_options, old_options);
        }

-      if (size != gotten_store) fprintf(outfile,
+      if (size != regex_gotten_store) fprintf(outfile,
        "Size disagreement: pcre_fullinfo=%d call to malloc for %d\n",
-        size, gotten_store);
+        size, regex_gotten_store);

      fprintf(outfile, "Capturing subpattern count = %d\n", count);
      if (backrefmax > 0)
@@ -822,6 +1067,18 @@ while (!done)
          }
        }

+      /* The NOPARTIAL bit is a private bit in the options, so we have
+      to fish it out via our back door */
+
+      all_options = ((real_pcre *)re)->options;
+      if (do_flip)
+        {
+        all_options = byteflip(all_options, sizeof(all_options));
+        }
+
+      if ((all_options & PCRE_NOPARTIAL) != 0)
+        fprintf(outfile, "Partial matching not supported\n");
+
      if (get_options == 0) fprintf(outfile, "No options\n");
        else fprintf(outfile, "Options:%s%s%s%s%s%s%s%s%s%s\n",
          ((get_options & PCRE_ANCHORED) != 0)? " anchored" : "",
@@ -871,77 +1128,103 @@ while (!done)
        else
          fprintf(outfile, "Need char = %d%s\n", ch, caseless);
        }
-      }
-
-    /* If /S was present, study the regexp to generate additional info to
-    help with the matching. */
-
-    if (do_study)
-      {
-      if (timeit)
-        {
-        register int i;
-        clock_t time_taken;
-        clock_t start_time = clock();
-        for (i = 0; i < LOOPREPEAT; i++)
-          extra = pcre_study(re, study_options, &error);
-        time_taken = clock() - start_time;
-        if (extra != NULL) free(extra);
-        fprintf(outfile, "  Study time %.3f milliseconds\n",
-          (((double)time_taken * 1000.0) / (double)LOOPREPEAT) /
-            (double)CLOCKS_PER_SEC);
-        }
-
-      extra = pcre_study(re, study_options, &error);
-      if (error != NULL)
-        fprintf(outfile, "Failed to study: %s\n", error);
-      else if (extra == NULL)
-        fprintf(outfile, "Study returned NULL\n");

      /* Don't output study size; at present it is in any case a fixed
      value, but it varies, depending on the computer architecture, and
-      so messes up the test suite. */
+      so messes up the test suite. (And with the /F option, it might be
+      flipped.) */

-      else if (do_showinfo)
+      if (do_study)
        {
-        size_t size;
-        uschar *start_bits = NULL;
-        new_info(re, extra, PCRE_INFO_STUDYSIZE, &size);
-        new_info(re, extra, PCRE_INFO_FIRSTTABLE, &start_bits);
-        /* fprintf(outfile, "Study size = %d\n", size); */
-        if (start_bits == NULL)
-          fprintf(outfile, "No starting character set\n");
+        if (extra == NULL)
+          fprintf(outfile, "Study returned NULL\n");
        else
          {
-          int i;
-          int c = 24;
-          fprintf(outfile, "Starting character set: ");
-          for (i = 0; i < 256; i++)
+          uschar *start_bits = NULL;
+          new_info(re, extra, PCRE_INFO_FIRSTTABLE, &start_bits);
+
+          if (start_bits == NULL)
+            fprintf(outfile, "No starting byte set\n");
+          else
            {
-            if ((start_bits[i/8] & (1<<(i%8))) != 0)
+            int i;
+            int c = 24;
+            fprintf(outfile, "Starting byte set: ");
+            for (i = 0; i < 256; i++)
              {
-              if (c > 75)
+              if ((start_bits[i/8] & (1<<(i&7))) != 0)
                {
-                fprintf(outfile, "\n  ");
-                c = 2;
-                }
-              if (isprint(i) && i != ' ')
-                {
-                fprintf(outfile, "%c ", i);
-                c += 2;
-                }
-              else
-                {
-                fprintf(outfile, "\\x%02x ", i);
-                c += 5;
+                if (c > 75)
+                  {
+                  fprintf(outfile, "\n  ");
+                  c = 2;
+                  }
+                if (isprint(i) && i != ' ')
+                  {
+                  fprintf(outfile, "%c ", i);
+                  c += 2;
+                  }
+                else
+                  {
+                  fprintf(outfile, "\\x%02x ", i);
+                  c += 5;
+                  }
                }
              }
+            fprintf(outfile, "\n");
            }
-          fprintf(outfile, "\n");
          }
        }
      }
-    }
+
+    /* If the '>' option was present, we write out the regex to a file, and
+    that is all. The first 8 bytes of the file are the regex length and then
+    the study length, in big-endian order. */
+
+    if (to_file != NULL)
+      {
+      FILE *f = fopen((char *)to_file, "wb");
+      if (f == NULL)
+        {
+        fprintf(outfile, "Unable to open %s: %s\n", to_file, strerror(errno));
+        }
+      else
+        {
+        uschar sbuf[8];
+        sbuf[0] = (true_size >> 24)  & 255;
+        sbuf[1] = (true_size >> 16)  & 255;
+        sbuf[2] = (true_size >>  8)  & 255;
+        sbuf[3] = (true_size)  & 255;
+
+        sbuf[4] = (true_study_size >> 24)  & 255;
+        sbuf[5] = (true_study_size >> 16)  & 255;
+        sbuf[6] = (true_study_size >>  8)  & 255;
+        sbuf[7] = (true_study_size)  & 255;
+
+        if (fwrite(sbuf, 1, 8, f) < 8 ||
+            fwrite(re, 1, true_size, f) < true_size)
+          {
+          fprintf(outfile, "Write error on %s: %s\n", to_file, strerror(errno));
+          }
+        else
+          {
+          fprintf(outfile, "Compiled regex written to %s\n", to_file);
+          if (extra != NULL)
+            {
+            if (fwrite(extra->study_data, 1, true_study_size, f) <
+                true_study_size)
+              {
+              fprintf(outfile, "Write error on %s: %s\n", to_file,
+                strerror(errno));
+              }
+            else fprintf(outfile, "Study data written to %s\n", to_file);
+            }
+          }
+        fclose(f);
+        }
+      continue;  /* With next regex */
+      }
+    }        /* End of non-POSIX compile */

  /* Read data lines and test them */

@@ -1045,10 +1328,14 @@ while (!done)
          }
        break;

-        case 0:   /* Allows for an empty line */
+        case 0:   /* \ followed by EOF allows for an empty line */
        p--;
        continue;

+        case '>':
+        while(isdigit(*p)) start_offset = start_offset * 10 + *p++ - '0';
+        continue;
+
        case 'A':  /* Option setting */
        options |= PCRE_ANCHORED;
        continue;
@@ -1159,6 +1446,10 @@ while (!done)
        if (n == 0) use_offsets = NULL;   /* Ensures it can't write to it */
        continue;

+        case 'P':
+        options |= PCRE_PARTIAL;
+        continue;
+
        case 'S':
        show_malloc = 1;
        continue;
@@ -1269,7 +1560,8 @@ while (!done)
            min = mid;
            mid = (mid == max - 1)? max : (max > 0)? (min + max)/2 : mid*2;
            }
-          else if (count >= 0 || count == PCRE_ERROR_NOMATCH)
+          else if (count >= 0 || count == PCRE_ERROR_NOMATCH ||
+                                 count == PCRE_ERROR_PARTIAL)
            {
            if (mid == min + 1)
              {
@@ -1305,8 +1597,11 @@ while (!done)
      /* The normal case is just to do the match once, with the default
      value of match_limit. */

-      else count = pcre_exec(re, extra, (char *)bptr, len,
-        start_offset, options | g_notempty, use_offsets, use_size_offsets);
+      else
+        {
+        count = pcre_exec(re, extra, (char *)bptr, len,
+          start_offset, options | g_notempty, use_offsets, use_size_offsets);
+        }

      if (count == 0)
        {
@@ -1393,6 +1688,14 @@ while (!done)
          }
        }

+      /* There was a partial match */
+
+      else if (count == PCRE_ERROR_PARTIAL)
+        {
+        fprintf(outfile, "Partial match\n");
+        break;  /* Out of the /g loop */
+        }
+
      /* Failed to match. If this is a /g or /G loop and we previously set
      g_notempty after a null match, this is not necessarily the end.
      We want to advance the start offset, and continue. In the case of UTF-8
@@ -12,23 +12,31 @@ Written by: Philip Hazel <ph10@cam.ac.uk>
           Copyright (c) 1997-2004 University of Cambridge

 -----------------------------------------------------------------------------
-Permission is granted to anyone to use this software for any purpose on any
-computer system, and to redistribute it freely, subject to the following
-restrictions:
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions are met:

-1. This software is distributed in the hope that it will be useful,
-   but WITHOUT ANY WARRANTY; without even the implied warranty of
-   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+    * Redistributions of source code must retain the above copyright notice,
+      this list of conditions and the following disclaimer.

-2. The origin of this software must not be misrepresented, either by
-   explicit claim or by omission.
+    * Redistributions in binary form must reproduce the above copyright
+      notice, this list of conditions and the following disclaimer in the
+      documentation and/or other materials provided with the distribution.

-3. Altered versions must be plainly marked as such, and must not be
-   misrepresented as being the original software.
+    * Neither the name of the University of Cambridge nor the names of its
+      contributors may be used to endorse or promote products derived from
+      this software without specific prior written permission.

-4. If PCRE is embedded in any software that is released under the GNU
-   General Purpose Licence (GPL), then the terms of that licence shall
-   supersede any condition above with which it is incompatible.
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
+LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+POSSIBILITY OF SUCH DAMAGE.
 -----------------------------------------------------------------------------
 */

@@ -57,7 +65,7 @@ Returns:        nothing
 */

 static void
-set_bit(uschar *start_bits, int c, BOOL caseless, compile_data *cd)
+set_bit(uschar *start_bits, unsigned int c, BOOL caseless, compile_data *cd)
 {
 start_bits[c/8] |= (1 << (c&7));
 if (caseless && (cd->ctypes[c] & ctype_letter) != 0)
@@ -123,7 +131,7 @@ do
      /* Skip over callout */

      case OP_CALLOUT:
-      tcode += 2;
+      tcode += 2 + 2*LINK_SIZE;
      break;

      /* Skip over extended extraction bracket number */
@@ -186,11 +194,10 @@ do
      /* At least one single char sets the bit and stops */

      case OP_EXACT:       /* Fall through */
-      tcode++;
-
-      case OP_CHARS:       /* Fall through */
-      tcode++;
+      tcode += 2;

+      case OP_CHAR:
+      case OP_CHARNC:
      case OP_PLUS:
      case OP_MINPLUS:
      set_bit(start_bits, tcode[1], caseless, cd);
@@ -403,8 +410,9 @@ pcre_study(const pcre *external_re, int options, const char **errorptr)
 uschar start_bits[32];
 pcre_extra *extra;
 pcre_study_data *study;
+const uschar *tables;
 const real_pcre *re = (const real_pcre *)external_re;
-uschar *code = (uschar *)re + sizeof(real_pcre) +
+uschar *code = (uschar *)re + re->name_table_offset +
  (re->name_count * re->name_entry_size);
 compile_data compile_block;

@@ -429,12 +437,16 @@ at present. */
 if ((re->options & (PCRE_ANCHORED|PCRE_FIRSTSET|PCRE_STARTLINE)) != 0)
  return NULL;

-/* Set the character tables in the block which is passed around */
+/* Set the character tables in the block that is passed around */

-compile_block.lcc = re->tables + lcc_offset;
-compile_block.fcc = re->tables + fcc_offset;
-compile_block.cbits = re->tables + cbits_offset;
-compile_block.ctypes = re->tables + ctypes_offset;
+tables = re->tables;
+if (tables == NULL)
+  (void)pcre_fullinfo(external_re, NULL, PCRE_INFO_DEFAULT_TABLES, &tables);
+
+compile_block.lcc = tables + lcc_offset;
+compile_block.fcc = tables + fcc_offset;
+compile_block.cbits = tables + cbits_offset;
+compile_block.ctypes = tables + ctypes_offset;

 /* See if we can find a fixed set of initial characters for the pattern. */

@@ -0,0 +1,151 @@
+/*************************************************
+*     libucp - Unicode Property Table handler    *
+*************************************************/
+
+/* This function provides a fast way of obtaining the basic Unicode properties
+of a character, using a compact binary tree that occupies less than 100K bytes.
+
+           Copyright (c) 2004 University of Cambridge
+
+-------------------------------------------------------------------------------
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions are met:
+
+    * Redistributions of source code must retain the above copyright notice,
+      this list of conditions and the following disclaimer.
+
+    * Redistributions in binary form must reproduce the above copyright
+      notice, this list of conditions and the following disclaimer in the
+      documentation and/or other materials provided with the distribution.
+
+    * Neither the name of the University of Cambridge nor the names of its
+      contributors may be used to endorse or promote products derived from
+      this software without specific prior written permission.
+
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
+LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+POSSIBILITY OF SUCH DAMAGE.
+-------------------------------------------------------------------------------
+*/
+
+
+#include "ucp.h"               /* Exported interface */
+#include "ucpinternal.h"       /* Internal table details */
+#include "ucptable.c"          /* The table itself */
+
+
+
+/*************************************************
+*         Search table and return data           *
+*************************************************/
+
+/* Two values are returned: the category is ucp_C, ucp_L, etc. The detailed
+character type is ucp_Lu, ucp_Nd, etc.
+
+Arguments:
+  c           the character value
+  type_ptr    the detailed character type is returned here
+  case_ptr    for letters, the opposite case is returned here, if there
+                is one, else zero
+
+Returns:      the character type category or -1 if not found
+*/
+
+static int
+ucp_findchar(const int c, int *type_ptr, int *case_ptr)
+{
+cnode *node = ucp_table;
+register int cc = c;
+int case_offset;
+
+for (;;)
+  {
+  register int d = node->f1 | ((node->f0 & f0_chhmask) << 16);
+  if (cc == d) break;                     /* Found the node for cc */
+  if (cc < d)
+    {
+    if ((node->f0 & f0_leftexists) == 0) return -1;
+    node ++;                              /* Left child is the next node */
+    }
+  else
+    {
+    register int roffset = (node->f2 & f2_rightmask) >> f2_rightshift;
+    if (roffset == 0) return -1;
+    node += 1 << (roffset - 1);           /* Right child: 2^(roffset-1) on */
+    }
+  }
+
+switch ((*type_ptr = ((node->f0 & f0_typemask) >> f0_typeshift)))
+  {
+  case ucp_Cc:
+  case ucp_Cf:
+  case ucp_Cn:
+  case ucp_Co:
+  case ucp_Cs:
+  return ucp_C;
+  break;
+
+  case ucp_Ll:
+  case ucp_Lu:
+  case_offset = node->f2 & f2_casemask;   /* 12-bit signed field */
+  if ((case_offset & 0x0800) != 0) case_offset |= 0xfffff000; /* Sign-extend */
+  *case_ptr = (case_offset == 0)? 0 : cc + case_offset;
+  return ucp_L;
+
+  case ucp_Lm:
+  case ucp_Lo:
+  case ucp_Lt:
+  *case_ptr = 0;
+  return ucp_L;
+  break;
+
+  case ucp_Mc:
+  case ucp_Me:
+  case ucp_Mn:
+  return ucp_M;
+  break;
+
+  case ucp_Nd:
+  case ucp_Nl:
+  case ucp_No:
+  return ucp_N;
+  break;
+
+  case ucp_Pc:
+  case ucp_Pd:
+  case ucp_Pe:
+  case ucp_Pf:
+  case ucp_Pi:
+  case ucp_Ps:
+  case ucp_Po:
+  return ucp_P;
+  break;
+
+  case ucp_Sc:
+  case ucp_Sk:
+  case ucp_Sm:
+  case ucp_So:
+  return ucp_S;
+  break;
+
+  case ucp_Zl:
+  case ucp_Zp:
+  case ucp_Zs:
+  return ucp_Z;
+  break;
+
+  default:         /* "Should never happen" */
+  return -1;
+  break;
+  }
+}
+
+/* End of ucp.c */
@@ -0,0 +1,58 @@
+/*************************************************
+*     libucp - Unicode Property Table handler    *
+*************************************************/
+
+/* These are the character categories that are returned by ucp_findchar */
+
+enum {
+  ucp_C,     /* Other */
+  ucp_L,     /* Letter */
+  ucp_M,     /* Mark */
+  ucp_N,     /* Number */
+  ucp_P,     /* Punctuation */
+  ucp_S,     /* Symbol */
+  ucp_Z      /* Separator */
+};
+
+/* These are the detailed character types that are returned by ucp_findchar */
+
+enum {
+  ucp_Cc,    /* Control */
+  ucp_Cf,    /* Format */
+  ucp_Cn,    /* Unassigned */
+  ucp_Co,    /* Private use */
+  ucp_Cs,    /* Surrogate */
+  ucp_Ll,    /* Lower case letter */
+  ucp_Lm,    /* Modifier letter */
+  ucp_Lo,    /* Other letter */
+  ucp_Lt,    /* Title case letter */
+  ucp_Lu,    /* Upper case letter */
+  ucp_Mc,    /* Spacing mark */
+  ucp_Me,    /* Enclosing mark */
+  ucp_Mn,    /* Non-spacing mark */
+  ucp_Nd,    /* Decimal number */
+  ucp_Nl,    /* Letter number */
+  ucp_No,    /* Other number */
+  ucp_Pc,    /* Connector punctuation */
+  ucp_Pd,    /* Dash punctuation */
+  ucp_Pe,    /* Close punctuation */
+  ucp_Pf,    /* Final punctuation */
+  ucp_Pi,    /* Initial punctuation */
+  ucp_Po,    /* Other punctuation */
+  ucp_Ps,    /* Open punctuation */
+  ucp_Sc,    /* Currency symbol */
+  ucp_Sk,    /* Modifier symbol */
+  ucp_Sm,    /* Mathematical symbol */
+  ucp_So,    /* Other symbol */
+  ucp_Zl,    /* Line separator */
+  ucp_Zp,    /* Paragraph separator */
+  ucp_Zs     /* Space separator */
+};
+
+/* For use in PCRE we make this function static so that there is no conflict if
+PCRE is linked with an application that makes use of an external version -
+assuming an external version is ever released... */
+
+static int ucp_findchar(const int, int *, int *);
+
+/* End of ucp.h */
@@ -0,0 +1,91 @@
+/*************************************************
+*     libucp - Unicode Property Table handler    *
+*************************************************/
+
+/* Internal header file defining the layout of compact nodes in the tree. */
+
+typedef struct cnode {
+  unsigned short int f0;   /* Left-child flag, type, top 8 codepoint bits */
+  unsigned short int f1;   /* Low 16 bits of the codepoint */
+  unsigned short int f2;   /* Right-child offset exponent, case offset */
+} cnode;
+
+/* Things for the f0 field */
+
+#define f0_leftexists   0x8000    /* Left child exists */
+#define f0_typemask     0x3f00    /* Type bits */
+#define f0_typeshift         8    /* Type shift */
+#define f0_chhmask      0x00ff    /* Character high bits */
+
+/* Things for the f2 field */
+
+#define f2_rightmask    0xf000    /* Mask for right offset bits */
+#define f2_rightshift       12    /* Shift for right offset */
+#define f2_casemask     0x0fff    /* Mask for case offset */
+
+/* The tree consists of a vector of structures of type cnode, with the root
+node as the first element. The three short ints (16-bits) are used as follows:
+
+(f0) (1) The 0x8000 bit of f0 is set if a left child exists. The child's node
+         is the next node in the vector.
+     (2) The 0x4000 bit of f0 is spare.
+     (3) The 0x3f00 bits of f0 contain the character type; this is a number
+         defined by the enumeration in ucp.h (e.g. ucp_Lu).
+     (4) The bottom 8 bits of f0 contain the most significant byte of the
+         character's 24-bit codepoint.
+
+(f1) (1) The f1 field contains the two least significant bytes of the
+         codepoint.
+
+(f2) (1) The 0xf000 bits of f2 contain zero if there is no right child of this
+         node. Otherwise, they contain one plus the exponent of the power of
+         two of the offset to the right node (e.g. a value of 3 means 8). The
+         units of the offset are node items.
+
+     (2) The 0x0fff bits of f2 contain the signed offset from this character to
+         its alternate cased value. They are zero if there is no such
+         character.
+
+
+-----------------------------------------------------------------------------
+||.|.| type (6) | ms char (8) ||  ls char (16)  ||....|  case offset (12)  ||
+-----------------------------------------------------------------------------
+  | |                                              |
+  | |-> spare                                      |
+  |                                        exponent of right
+  |-> left child exists                       child offset
+
+
+The upper/lower casing information is set only for characters that come in
+pairs. There are (at present) four non-one-to-one mappings in the Unicode data.
+These are ignored. They are:
+
+  1FBE Greek Prosgegrammeni (lower, with upper -> capital iota)
+  2126 Ohm
+  212A Kelvin
+  212B Angstrom
+
+Certainly for the last three, having an alternate case would seem to be a
+mistake. I don't know any Greek, so cannot comment on the first one.
+
+
+When searching the tree, proceed as follows:
+
+(1) Start at the first node.
+
+(2) Extract the character value from f1 and the bottom 8 bits of f0;
+
+(3) Compare with the character being sought. If equal, we are done.
+
+(4) If the test character is smaller, inspect the f0_leftexists flag. If it is
+    not set, the character is not in the tree. If it is set, move to the next
+    node, and go to (2).
+
+(5) If the test character is bigger, extract the f2_rightmask bits from f2, and
+    shift them right by f2_rightshift. If the result is zero, the character is
+    not in the tree. Otherwise, calculate the number of nodes to skip by
+    shifting the value 1 left by this number minus one. Go to (2).
+*/
+
+
+/* End of ucpinternal.h */
@@ -0,0 +1,93 @@
+/*************************************************
+*      Perl-Compatible Regular Expressions       *
+*************************************************/
+
+/*
+This is a library of functions to support regular expressions whose syntax
+and semantics are as close as possible to those of the Perl 5 language. See
+the file Tech.Notes for some information on the internals.
+
+Written by: Philip Hazel <ph10@cam.ac.uk>
+
+           Copyright (c) 1997-2004 University of Cambridge
+
+-----------------------------------------------------------------------------
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions are met:
+
+    * Redistributions of source code must retain the above copyright notice,
+      this list of conditions and the following disclaimer.
+
+    * Redistributions in binary form must reproduce the above copyright
+      notice, this list of conditions and the following disclaimer in the
+      documentation and/or other materials provided with the distribution.
+
+    * Neither the name of the University of Cambridge nor the names of its
+      contributors may be used to endorse or promote products derived from
+      this software without specific prior written permission.
+
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
+LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+POSSIBILITY OF SUCH DAMAGE.
+-----------------------------------------------------------------------------
+*/
+
+/* This module contains a table for translating Unicode property names into
+code values for the ucp_findchar function. It is in a separate module so that
+it can be included both in the main pcre library, and into pcretest (for
+printing out internals). */
+
+typedef struct {
+  const char *name;    /* Unicode property name, e.g. "L" or "Lu" */
+  int value;           /* Code value that the name translates to */
+} ucp_type_table;
+
+static ucp_type_table utt[] = {  /* 1-letter names: 128 + general category */
+  { "C",  128 + ucp_C },
+  { "Cc", ucp_Cc },
+  { "Cf", ucp_Cf },
+  { "Cn", ucp_Cn },
+  { "Co", ucp_Co },
+  { "Cs", ucp_Cs },
+  { "L",  128 + ucp_L },
+  { "Ll", ucp_Ll },
+  { "Lm", ucp_Lm },
+  { "Lo", ucp_Lo },
+  { "Lt", ucp_Lt },
+  { "Lu", ucp_Lu },
+  { "M",  128 + ucp_M },
+  { "Mc", ucp_Mc },
+  { "Me", ucp_Me },
+  { "Mn", ucp_Mn },
+  { "N",  128 + ucp_N },
+  { "Nd", ucp_Nd },
+  { "Nl", ucp_Nl },
+  { "No", ucp_No },
+  { "P",  128 + ucp_P },
+  { "Pc", ucp_Pc },
+  { "Pd", ucp_Pd },
+  { "Pe", ucp_Pe },
+  { "Pf", ucp_Pf },
+  { "Pi", ucp_Pi },
+  { "Po", ucp_Po },
+  { "Ps", ucp_Ps },
+  { "S",  128 + ucp_S },
+  { "Sc", ucp_Sc },
+  { "Sk", ucp_Sk },
+  { "Sm", ucp_Sm },
+  { "So", ucp_So },
+  { "Z",  128 + ucp_Z },
+  { "Zl", ucp_Zl },
+  { "Zp", ucp_Zp },
+  { "Zs", ucp_Zs }
+};
+
+/* End of ucptypetable.c */