1
0
mirror of https://github.com/php/php-src.git synced 2026-03-24 00:02:20 +01:00

ext/pcre: Add "/r" modifier (#13583)

Adds support for the "caseless restricted" matching mode introduced in
PCRE2 10.43, exposed through the new "r" modifier.

This is `PCRE2_EXTRA_CASELESS_RESTRICT` in PCRE2. It is an "extra"
option, which means it cannot be passed in the options argument of
pcre2_compile().

Instead, the option is passed in a pcre2_set_compile_extra_options() call.
Previously, these extra options were set in php_pcre_init_pcre2(),
but after this change it is possible to customize them per pattern
by adding bits to `eoptions` in pcre_get_compiled_regex_cache_ex().

The tests for this change are ported from the upstream test suite[^1].

[^1]: https://github.com/PCRE2Project/pcre2/commit/c13d54f6581#diff-8c8312e4eb2d35bb16485404b7b5cc0eaef0bca1aa95ff5febf6a1890048305c
This commit is contained in:
Ayesh Karunaratne
2024-03-06 02:51:04 +07:00
committed by GitHub
parent 353d4ce075
commit 7b23470666
4 changed files with 112 additions and 0 deletions

View File

@@ -210,6 +210,10 @@ PHP 8.4 UPGRADE NOTES
As a consequence, LoongArch JIT support has been added, spaces
are now allowed between braces in Perl-compatible items, and
variable-length lookbehind assertions are now supported.
. Added support for the "r" (PCRE2_EXTRA_CASELESS_RESTRICT) modifier, as well
as the (?r) mode modifier. When enabled along with the case-insensitive
modifier ("i"), case-insensitive matching no longer treats an ASCII
character and a non-ASCII character as equivalent.
- PDO:
. Added support for driver-specific subclasses.

View File

@@ -185,6 +185,9 @@ PHP 8.4 INTERNALS UPGRADE NOTES
When flags should be ignored, pass 0 to the flags argument.
- php_pcre_match_impl() and pcre_get_compiled_regex_cache_ex() now use
proper boolean argument types instead of integer types.
- pcre_get_compiled_regex_cache_ex() now collects extra compile options
(for example, from modifiers used in the expression) and passes them to
pcre2_set_compile_extra_options().
========================
4. OpCode changes

View File

@@ -592,6 +592,7 @@ PHPAPI pcre_cache_entry* pcre_get_compiled_regex_cache_ex(zend_string *regex, bo
#else
uint32_t coptions = 0;
#endif
uint32_t eoptions = PHP_PCRE_DEFAULT_EXTRA_COPTIONS;
PCRE2_UCHAR error[128];
PCRE2_SIZE erroffset;
int errnumber;
@@ -722,6 +723,7 @@ PHPAPI pcre_cache_entry* pcre_get_compiled_regex_cache_ex(zend_string *regex, bo
/* PCRE specific options */
case 'A': coptions |= PCRE2_ANCHORED; break;
case 'D': coptions |= PCRE2_DOLLAR_ENDONLY;break;
case 'r': eoptions |= PCRE2_EXTRA_CASELESS_RESTRICT; break;
case 'S': /* Pass. */ break;
case 'X': /* Pass. */ break;
case 'U': coptions |= PCRE2_UNGREEDY; break;
@@ -776,6 +778,8 @@ PHPAPI pcre_cache_entry* pcre_get_compiled_regex_cache_ex(zend_string *regex, bo
}
pcre2_set_character_tables(cctx, tables);
pcre2_set_compile_extra_options(cctx, eoptions);
/* Compile pattern and display a warning if compilation failed. */
re = pcre2_compile((PCRE2_SPTR)pattern, pattern_len, coptions, &errnumber, &erroffset, cctx);

View File

@@ -0,0 +1,101 @@
--TEST--
testing /r modifier in preg_* functions
--FILE--
<?php
// Exercises the "r" modifier (PCRE2_EXTRA_CASELESS_RESTRICT) together with "i" and "u".
// "\u{17f}" (LATIN SMALL LETTER LONG S) and "\u{212a}" (KELVIN SIGN) are non-ASCII
// code points that case-insensitively match ASCII "s" and "k" under /iu. Each case
// checks whether that equivalence is honoured (without "r") or rejected (with "r").
// The var_dump() calls pair one-to-one, in order, with the lines of --EXPECT--.
echo "SK substitute matching" . PHP_EOL;
var_dump(preg_match('/AskZ/iur', 'AskZ')); // match
var_dump(preg_match('/AskZ/iur', 'aSKz')); // match
var_dump(preg_match('/AskZ/iur', "A\u{17f}kZ")); // no match
var_dump(preg_match('/AskZ/iur', "As\u{212a}Z")); // no match
var_dump(preg_match('/AskZ/iu', 'AskZ')); // match
var_dump(preg_match('/AskZ/iu', 'aSKz')); // match
var_dump(preg_match('/AskZ/iu', "A\u{17f}kZ")); // match
var_dump(preg_match('/AskZ/iu', "As\u{212a}Z")); // match
echo "K substitute matching" . PHP_EOL;
var_dump(preg_match('/k/iu', "\u{212A}")); // match
var_dump(preg_match('/k/iur', "\u{212A}")); // no match
echo "non-ASCII in expressions" . PHP_EOL;
// The restriction also applies when the non-ASCII code points are in the pattern.
var_dump(preg_match('/A\x{17f}\x{212a}Z/iu', 'AskZ')); // match
var_dump(preg_match('/A\x{17f}\x{212a}Z/iur', 'AskZ')); // no match
echo "Character sets" . PHP_EOL;
// All of these match, even with "r": the pattern is unanchored and every subject
// contains at least one ASCII character from the set (e.g. the leading "A").
var_dump(preg_match('/[AskZ]+/iur', 'AskZ')); // match
var_dump(preg_match('/[AskZ]+/iur', 'aSKz')); // match
var_dump(preg_match('/[AskZ]+/iur', "A\u{17f}kZ")); // match
var_dump(preg_match('/[AskZ]+/iur', "As\u{212a}Z")); // match
var_dump(preg_match('/[AskZ]+/iu', 'AskZ')); // match
var_dump(preg_match('/[AskZ]+/iu', 'aSKz')); // match
var_dump(preg_match('/[AskZ]+/iu', "A\u{17f}kZ")); // match
var_dump(preg_match('/[AskZ]+/iu', "As\u{212a}Z")); // match
echo "non-ASCII in character sets" . PHP_EOL;
// The set holds only non-ASCII code points and the subject is pure ASCII.
var_dump(preg_match('/[\x{17f}\x{212a}]+/iur', 'AskZ')); // no match
var_dump(preg_match('/[\x{17f}\x{212a}]+/iu', 'AskZ')); // match
echo "Meta characters and negate character sets". PHP_EOL;
// Every subject starts with "A", which lies outside each negated set, so all of
// these match regardless of "r".
var_dump(preg_match('/[^s]+/iur', "A\u{17f}Z")); // match
var_dump(preg_match('/[^s]+/iu', "A\u{17f}Z")); // match
// NOTE(review): the next case is identical to the previous one; it is kept because
// --EXPECT-- lists one output line for it.
var_dump(preg_match('/[^s]+/iu', "A\u{17f}Z")); // match
var_dump(preg_match('/[^k]+/iur', "A\u{212a}Z")); // match
var_dump(preg_match('/[^k]+/iu', "A\u{212a}Z")); // match
var_dump(preg_match('/[^sk]+/iur', "A\u{17f}\u{212a}Z")); // match
var_dump(preg_match('/[^sk]+/iu', "A\u{17f}\u{212a}Z")); // match
var_dump(preg_match('/[^\x{17f}]+/iur', "AsSZ")); // match
var_dump(preg_match('/[^\x{17f}]+/iu', "AsSZ")); // match
echo "Modifier used within the expression" . PHP_EOL;
// In /s(?r)s(?-r)s(?r:s)s/ only the 2nd and 4th "s" are under "r", so those two
// positions must be ASCII "s"/"S"; the 1st, 3rd and 5th may also be "\u{17f}".
var_dump(preg_match('/s(?r)s(?-r)s(?r:s)s/iu', "\u{17f}S\u{17f}S\u{17f}")); // match
var_dump(preg_match('/s(?r)s(?-r)s(?r:s)s/iu', "\u{17f}\u{17f}\u{17f}S\u{17f}")); // no match
var_dump(preg_match('/s(?r)s(?-r)s(?r:s)s/iu', "\u{17f}S\u{17f}\u{17f}\u{17f}")); // no match
// NOTE(review): (?^i) presumably resets the inline options and re-enables only "i",
// leaving the first "k" restricted and the second unrestricted. Confirm against the
// PCRE2 pattern documentation.
var_dump(preg_match('/k(?^i)k/iur', "K\u{212a}")); // match
var_dump(preg_match('/k(?^i)k/iur', "\u{212a}\u{212a}")); // no match
echo "Done";
?>
--EXPECT--
SK substitute matching
int(1)
int(1)
int(0)
int(0)
int(1)
int(1)
int(1)
int(1)
K substitute matching
int(1)
int(0)
non-ASCII in expressions
int(1)
int(0)
Character sets
int(1)
int(1)
int(1)
int(1)
int(1)
int(1)
int(1)
int(1)
non-ASCII in character sets
int(0)
int(1)
Meta characters and negate character sets
int(1)
int(1)
int(1)
int(1)
int(1)
int(1)
int(1)
int(1)
int(1)
Modifier used within the expression
int(1)
int(0)
int(0)
int(1)
int(0)
Done