mirror of
https://github.com/php/php-src.git
synced 2026-03-24 00:02:20 +01:00
Test: grapheme_extract should slide properly past error bytes. (#17404)
grapheme_extract should slide properly past error bytes.
Adds a test to assert that the `$next` parameter of `grapheme_extract()`
points to the next byte offset in the input `$haystack` after accounting
for the moved offset, according to the docs:
> If offset does not point to the first byte of a UTF-8 character,
> the start position is moved to the next character boundary.
It seems that the existing behavior is to find the next grapheme
boundary from the original provided offset, but if the offset doesn’t
point to a valid starting byte, the assigned `$next` value will point
to the byte that was immediately decoded in the same call, leading to
possible infinite loops in user-space code.
```
while ( $at < strlen( $s ) ) {
$grapheme = grapheme_extract( "\x85PHP", 1, GRAPHEME_EXTR_COUNT, $at, $at );
// never moves past the second byte, always returns 'P'
}
```
This commit is contained in:
@@ -109,6 +109,10 @@ PHP 8.5 UPGRADE NOTES
|
||||
. IntlDateFormatter::setTimeZone()/datefmt_set_timezone()
|
||||
throws an IntlException on uninitialised classes/clone failures.
|
||||
|
||||
. grapheme_extract() properly assigns $next value when skipping over
|
||||
invalid starting bytes. Previously there were cases where it would
|
||||
point to the start of the grapheme boundary instead of the end.
|
||||
|
||||
- PCNTL:
|
||||
. pcntl_exec() now has a formal return type of false.
|
||||
|
||||
|
||||
@@ -781,6 +781,7 @@ PHP_FUNCTION(grapheme_extract)
|
||||
|
||||
while ( !U8_IS_SINGLE(*pstr) && !U8_IS_LEAD(*pstr) ) {
|
||||
pstr++;
|
||||
start++;
|
||||
if ( pstr >= str_end ) {
|
||||
intl_error_set( NULL, U_ILLEGAL_ARGUMENT_ERROR,
|
||||
"grapheme_extract: invalid input string", 0 );
|
||||
|
||||
@@ -590,6 +590,8 @@ function ut_main()
|
||||
array( $char_a_ring_nfd . "bcde" . $char_a_ring_nfd . "f", 4, 5, 11, "de" . $char_a_ring_nfd . "f" ),
|
||||
array( $char_a_ring_nfd . "bcde" . $char_a_ring_nfd . "f", 4, -6, 11, "de" . $char_a_ring_nfd . "f" ),
|
||||
|
||||
array( "\x95\x00a\x85b", 1, 0, 2, "\x00" ),
|
||||
|
||||
array( $char_a_ring_nfd . $char_o_diaeresis_nfd . $char_o_diaeresis_nfd, 3, $char_a_ring_nfd . $char_o_diaeresis_nfd . $char_o_diaeresis_nfd ),
|
||||
array( $char_a_ring_nfd . $char_o_diaeresis_nfd . $char_o_diaeresis_nfd, 2, $char_a_ring_nfd . $char_o_diaeresis_nfd ),
|
||||
array( $char_a_ring_nfd . $char_o_diaeresis_nfd . "c", 1, $char_a_ring_nfd . "" ),
|
||||
@@ -1134,6 +1136,7 @@ extract from "a%CC%8Abcde" "2" graphemes - grapheme_extract starting at byte pos
|
||||
extract from "a%CC%8Abcde" "2" graphemes - grapheme_extract starting at byte position -7 with $next = a%CC%8Ab == a%CC%8Ab $next=4 == 4
|
||||
extract from "a%CC%8Abcdea%CC%8Af" "4" graphemes - grapheme_extract starting at byte position 5 with $next = dea%CC%8Af == dea%CC%8Af $next=11 == 11
|
||||
extract from "a%CC%8Abcdea%CC%8Af" "4" graphemes - grapheme_extract starting at byte position -6 with $next = dea%CC%8Af == dea%CC%8Af $next=11 == 11
|
||||
extract from "%95%00a%85b" "1" graphemes - grapheme_extract starting at byte position 0 with $next = %00 == %00 $next=2 == 2
|
||||
extract from "a%CC%8Ao%CC%88o%CC%88" "3" graphemes - grapheme_extract = a%CC%8Ao%CC%88o%CC%88 == a%CC%8Ao%CC%88o%CC%88
|
||||
extract from "a%CC%8Ao%CC%88o%CC%88" "2" graphemes - grapheme_extract = a%CC%8Ao%CC%88 == a%CC%8Ao%CC%88
|
||||
extract from "a%CC%8Ao%CC%88c" "1" graphemes - grapheme_extract = a%CC%8A == a%CC%8A
|
||||
|
||||
Reference in New Issue
Block a user