mirror of
https://github.com/php/php-src.git
synced 2026-04-23 07:58:20 +02:00
c211e67b4e
The documentation for mb_strcut states:
mb_strcut(
string $string,
int $start,
?int $length = null,
?string $encoding = null
): string
mb_strcut() extracts a substring from a string similarly to mb_substr(),
but operates on bytes instead of characters. If the cut position happens
to be between two bytes of a multi-byte character, the cut is performed
starting from the first byte of that character.
My understanding of the $length parameter for mb_strcut is that it
specifies the range of bytes to extract from $string, and that all
characters encoded by those bytes should be included in the returned
string, even if that means the returned string would be longer than
$length bytes. This can happen either if 1) there is more than one way
to encode the same character in $encoding, and one way requires more
bytes than the other, or 2) $encoding uses escape sequences.
However, discussion with users of mb_strcut indicates that many of them
interpret $length as the maximum length of the *returned* string.
This is also the historical behavior of the function.
Hence, there is no need to modify the behavior of mb_strcut and then
remove XFAIL from these test cases afterwards. We can keep the current
behavior.
98 lines
2.7 KiB
PHP
98 lines
2.7 KiB
PHP
--TEST--
|
|
Test output of mb_strcut for text encodings which use escape sequences
|
|
--EXTENSIONS--
|
|
mbstring
|
|
--FILE--
|
|
<?php

// Text encodings exercised by this test.
$encodings = ['JIS', 'ISO-2022-JP', 'ISO-2022-JP-2004'];

// Each case pairs an input string (in the internal encoding) with the
// byte length passed to mb_strcut().
$cases = [
    ['宛如繁星般宛如皎月般', 15],
    ['星のように月のように', 20],
    ['あaいb', 10],
    ['AAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAA', 10],
    ['???', 2],
];

foreach ($cases as [$input, $bytes_length]) {
    foreach ($encodings as $encoding) {
        // Convert to the target encoding, cut at the byte limit, then convert
        // back to the internal encoding so the result can be printed.
        $encoded = mb_convert_encoding($input, $encoding, mb_internal_encoding());
        $truncated = mb_strcut($encoded, 0, $bytes_length, $encoding);
        $decoded = mb_convert_encoding($truncated, mb_internal_encoding(), $encoding);

        echo $encoding, ': ', $decoded, PHP_EOL;
    }

    // A blank line separates the output of consecutive cases.
    echo PHP_EOL;
}

// $input and $bytes_length still hold the values of the last case above
// ('???' and 2); here the unconverted input is cut directly.
foreach ($encodings as $encoding) {
    $direct_cut = mb_strcut($input, 0, $bytes_length, $encoding);
    var_dump($direct_cut);
}

?>
|
|
--EXPECTF--
|
|
JIS: 宛如繁星
|
|
ISO-2022-JP: 宛如繁星
|
|
ISO-2022-JP-2004: 宛如繁星
|
|
|
|
JIS: 星のように月の
|
|
ISO-2022-JP: 星のように月の
|
|
ISO-2022-JP-2004: 星のように月
|
|
|
|
JIS: あa
|
|
ISO-2022-JP: あa
|
|
ISO-2022-JP-2004: あa
|
|
|
|
JIS: AAAAAAAAAA
|
|
ISO-2022-JP: AAAAAAAAAA
|
|
ISO-2022-JP-2004: AAAAAAAAAA
|
|
|
|
JIS: ??
|
|
ISO-2022-JP: ??
|
|
ISO-2022-JP-2004: ??
|
|
|
|
string(2) "??"
|
|
string(2) "??"
|
|
string(2) "??"
|