From c9e78e6d338cc46dcadb39b3e2df119fa969e72b Mon Sep 17 00:00:00 2001 From: Nikita Popov Date: Fri, 7 Feb 2020 16:39:06 +0100 Subject: [PATCH 1/2] PCRE: Check whether start offset is on char boundary We need not just the whole string to be UTF-8, but the start position to be on a character boundary as well. Check this by looking for a continuation byte. --- ext/pcre/php_pcre.c | 18 +++++++++++++++++- ext/pcre/tests/bug79241.phpt | 22 ++++++++++++++++++++++ 2 files changed, 39 insertions(+), 1 deletion(-) create mode 100644 ext/pcre/tests/bug79241.phpt diff --git a/ext/pcre/php_pcre.c b/ext/pcre/php_pcre.c index 481d564f66b..104b8d4c975 100644 --- a/ext/pcre/php_pcre.c +++ b/ext/pcre/php_pcre.c @@ -1130,6 +1130,22 @@ static void php_do_pcre_match(INTERNAL_FUNCTION_PARAMETERS, int global) /* {{{ * } /* }}} */ +static zend_always_inline zend_bool is_known_valid_utf8( + zend_string *subject_str, PCRE2_SIZE start_offset) { + if (!(GC_FLAGS(subject_str) & IS_STR_VALID_UTF8)) { + /* We don't know whether the string is valid UTF-8 or not. */ + return 0; + } + + if (start_offset == ZSTR_LEN(subject_str)) { + /* Degenerate case: Offset points to end of string. */ + return 1; + } + + /* Check that the offset does not point to an UTF-8 continuation byte. */ + return (ZSTR_VAL(subject_str)[start_offset] & 0xc0) != 0x80; +} + /* {{{ php_pcre_match_impl() */ PHPAPI void php_pcre_match_impl(pcre_cache_entry *pce, zend_string *subject_str, zval *return_value, zval *subpats, int global, int use_flags, zend_long flags, zend_off_t start_offset) @@ -1247,7 +1263,7 @@ PHPAPI void php_pcre_match_impl(pcre_cache_entry *pce, zend_string *subject_str, } } - options = (pce->compile_options & PCRE2_UTF) && !(GC_FLAGS(subject_str) & IS_STR_VALID_UTF8) + options = (pce->compile_options & PCRE2_UTF) && !is_known_valid_utf8(subject_str, start_offset2) ? 0 : PCRE2_NO_UTF_CHECK; /* Execute the regular expression. */ diff --git a/ext/pcre/tests/bug79241.phpt b/ext/pcre/tests/bug79241.phpt new file mode 100644 index 00000000000..92e5253735f --- /dev/null +++ b/ext/pcre/tests/bug79241.phpt @@ -0,0 +1,22 @@ +--TEST-- +Bug #79241: Segmentation fault on preg_match() +--FILE-- + +--EXPECT-- +int(0) +bool(false) +bool(true) From cd5591a28d738b1b00c96c0e6cae91b490dba56d Mon Sep 17 00:00:00 2001 From: Nikita Popov Date: Fri, 7 Feb 2020 17:01:39 +0100 Subject: [PATCH 2/2] PCRE: Only remember valid UTF-8 if start offset zero PCRE only validates the string starting from the start offset (minus maximum look-behind, but let's ignore that), so we can only remember that the string is fully valid UTF-8 is the original start offset is zero. --- NEWS | 1 + ext/pcre/php_pcre.c | 11 +++++++---- ext/pcre/tests/bug79241.phpt | 11 +++++++++++ 3 files changed, 19 insertions(+), 4 deletions(-) diff --git a/NEWS b/NEWS index 26885fae720..a08c2d77a9c 100644 --- a/NEWS +++ b/NEWS @@ -13,6 +13,7 @@ PHP NEWS - PCRE: . Fixed bug #79188 (Memory corruption in preg_replace/preg_replace_callback and unicode). (Nikita) + . Fixed bug #79241 (Segmentation fault on preg_match()). (Nikita) ?? ??? ????, PHP 7.4.3 diff --git a/ext/pcre/php_pcre.c b/ext/pcre/php_pcre.c index 104b8d4c975..c50bd2fba22 100644 --- a/ext/pcre/php_pcre.c +++ b/ext/pcre/php_pcre.c @@ -1167,7 +1167,7 @@ PHPAPI void php_pcre_match_impl(pcre_cache_entry *pce, zend_string *subject_str, PCRE2_SPTR mark = NULL; /* Target for MARK name */ zval marks; /* Array of marks for PREG_PATTERN_ORDER */ pcre2_match_data *match_data; - PCRE2_SIZE start_offset2; + PCRE2_SIZE start_offset2, orig_start_offset; char *subject = ZSTR_VAL(subject_str); size_t subject_len = ZSTR_LEN(subject_str); @@ -1263,8 +1263,10 @@ PHPAPI void php_pcre_match_impl(pcre_cache_entry *pce, zend_string *subject_str, } } - options = (pce->compile_options & PCRE2_UTF) && !is_known_valid_utf8(subject_str, start_offset2) - ? 0 : PCRE2_NO_UTF_CHECK; + orig_start_offset = start_offset2; + options = + (pce->compile_options & PCRE2_UTF) && !is_known_valid_utf8(subject_str, orig_start_offset) + ? 0 : PCRE2_NO_UTF_CHECK; /* Execute the regular expression. */ #ifdef HAVE_PCRE_JIT_SUPPORT @@ -1454,7 +1456,8 @@ error: if (PCRE_G(error_code) == PHP_PCRE_NO_ERROR) { /* If there was no error and we're in /u mode, remember that the string is valid UTF-8. */ - if ((pce->compile_options & PCRE2_UTF) && !ZSTR_IS_INTERNED(subject_str)) { + if ((pce->compile_options & PCRE2_UTF) + && !ZSTR_IS_INTERNED(subject_str) && orig_start_offset == 0) { GC_ADD_FLAGS(subject_str, IS_STR_VALID_UTF8); } diff --git a/ext/pcre/tests/bug79241.phpt b/ext/pcre/tests/bug79241.phpt index 92e5253735f..f6dbb8bea4e 100644 --- a/ext/pcre/tests/bug79241.phpt +++ b/ext/pcre/tests/bug79241.phpt @@ -15,8 +15,19 @@ var_dump(preg_match($pattern, $text, $matches, 0, 0)); var_dump(preg_match($pattern, $text, $matches, 0, 1)); var_dump(preg_last_error() == PREG_BAD_UTF8_OFFSET_ERROR); +echo "\n"; + +$text = "VA\xff"; $text .= "LID"; +var_dump(preg_match($pattern, $text, $matches, 0, 4)); +var_dump(preg_match($pattern, $text, $matches, 0, 0)); +var_dump(preg_last_error() == PREG_BAD_UTF8_ERROR); + ?> --EXPECT-- int(0) bool(false) bool(true) + +int(1) +bool(false) +bool(true)