1
0
mirror of https://github.com/php/php-src.git synced 2026-03-24 00:02:20 +01:00

Test: grapheme_extract should slide properly past error bytes. (#17404)

grapheme_extract should slide properly past error bytes.

Adds a test to assert that the `$next` parameter of `grapheme_extract()`
points to the next byte offset in the input `$haystack` after accounting
for the moved offset, according to the docs:

> If offset does not point to the first byte of a UTF-8 character,
> the start position is moved to the next character boundary.

It appears that the existing behavior is to find the next grapheme
boundary starting from the originally provided offset. However, if that
offset doesn’t point to a valid starting byte, the assigned `$next` value
points to the byte that was just decoded in the same call rather than to
the byte following it, which can lead to infinite loops in user-space code.

```
$s  = "\x85PHP";
$at = 0;
while ( $at < strlen( $s ) ) {
	$grapheme = grapheme_extract( $s, 1, GRAPHEME_EXTR_COUNT, $at, $at );
	// never moves past the second byte, always returns 'P'
}
```
This commit is contained in:
Dennis Snell
2025-01-17 08:12:41 +00:00
committed by GitHub
parent d0d8e6867a
commit f2d8420b1c
3 changed files with 8 additions and 0 deletions

View File

@@ -109,6 +109,10 @@ PHP 8.5 UPGRADE NOTES
. IntlDateFormatter::setTimeZone()/datefmt_set_timezone()
throws an IntlException on uninitialised classes/clone failures.
. grapheme_extract() now properly assigns the $next value when skipping
over invalid starting bytes. Previously there were cases where it would
point to the start of the extracted grapheme instead of its end.
- PCNTL:
. pcntl_exec() now has a formal return type of false.

View File

@@ -781,6 +781,7 @@ PHP_FUNCTION(grapheme_extract)
while ( !U8_IS_SINGLE(*pstr) && !U8_IS_LEAD(*pstr) ) {
pstr++;
start++;
if ( pstr >= str_end ) {
intl_error_set( NULL, U_ILLEGAL_ARGUMENT_ERROR,
"grapheme_extract: invalid input string", 0 );

View File

@@ -590,6 +590,8 @@ function ut_main()
array( $char_a_ring_nfd . "bcde" . $char_a_ring_nfd . "f", 4, 5, 11, "de" . $char_a_ring_nfd . "f" ),
array( $char_a_ring_nfd . "bcde" . $char_a_ring_nfd . "f", 4, -6, 11, "de" . $char_a_ring_nfd . "f" ),
array( "\x95\x00a\x85b", 1, 0, 2, "\x00" ),
array( $char_a_ring_nfd . $char_o_diaeresis_nfd . $char_o_diaeresis_nfd, 3, $char_a_ring_nfd . $char_o_diaeresis_nfd . $char_o_diaeresis_nfd ),
array( $char_a_ring_nfd . $char_o_diaeresis_nfd . $char_o_diaeresis_nfd, 2, $char_a_ring_nfd . $char_o_diaeresis_nfd ),
array( $char_a_ring_nfd . $char_o_diaeresis_nfd . "c", 1, $char_a_ring_nfd . "" ),
@@ -1134,6 +1136,7 @@ extract from "a%CC%8Abcde" "2" graphemes - grapheme_extract starting at byte pos
extract from "a%CC%8Abcde" "2" graphemes - grapheme_extract starting at byte position -7 with $next = a%CC%8Ab == a%CC%8Ab $next=4 == 4
extract from "a%CC%8Abcdea%CC%8Af" "4" graphemes - grapheme_extract starting at byte position 5 with $next = dea%CC%8Af == dea%CC%8Af $next=11 == 11
extract from "a%CC%8Abcdea%CC%8Af" "4" graphemes - grapheme_extract starting at byte position -6 with $next = dea%CC%8Af == dea%CC%8Af $next=11 == 11
extract from "%95%00a%85b" "1" graphemes - grapheme_extract starting at byte position 0 with $next = %00 == %00 $next=2 == 2
extract from "a%CC%8Ao%CC%88o%CC%88" "3" graphemes - grapheme_extract = a%CC%8Ao%CC%88o%CC%88 == a%CC%8Ao%CC%88o%CC%88
extract from "a%CC%8Ao%CC%88o%CC%88" "2" graphemes - grapheme_extract = a%CC%8Ao%CC%88 == a%CC%8Ao%CC%88
extract from "a%CC%8Ao%CC%88c" "1" graphemes - grapheme_extract = a%CC%8A == a%CC%8A