mirror of
https://github.com/php/php-src.git
synced 2026-04-19 22:11:12 +02:00
As a performance optimization, mbstring implements some functions using tables which give the (byte) length of a multi-byte character using a lookup based on the value of the first byte. These tables are called `mblen_table`. For many years, the mblen_table for SJIS has had '2' in position 0x80. That is wrong; it should have been '1'. Reasons: For SJIS, SJIS-2004, and mobile variants of SJIS, 0x80 has never been treated as the first byte of a 2-byte character. It has always been treated as a single erroneous byte. On the other hand, 0x80 is a valid character in MacJapanese... but a 1-byte character, not a 2-byte one. The same applies to bytes 0xFD-FF; these are 1-byte characters in MacJapanese, and in other SJIS variants, they are not valid (as the first byte of a character). Thanks to the GitHub user 'youkidearitai' for finding this problem.
100 lines
2.9 KiB
PHP
100 lines
2.9 KiB
PHP
--TEST--
|
|
mb_str_split() tests for the Japanese language
|
|
--EXTENSIONS--
|
|
mbstring
|
|
--INI--
|
|
output_handler=
|
|
--FILE--
|
|
<?php
|
|
ini_set('include_path','.');
|
|
include_once('common.inc');
|
|
|
|
/* Sample text used by the checks below; the literal is UTF-8 in this file. */
$string = "日本"; /* 2 chars */
$len = 2;

/* Encodings the sample is converted into before it is split. */
$encodings = [
    "BIG-5",
    "EUC-JP",
    "ISO-2022-JP",
    "SJIS",
    "UTF-16BE",
    "UTF-16LE",
    "UTF-32BE",
    "UTF-32LE",
    "UTF-8"
];

foreach ($encodings as $cs) {
    $converted = mb_convert_encoding($string, $cs, "UTF-8");
    $singleChars = mb_str_split($converted, 1, $cs);

    /*
     * Chunk count: for every chunk width from 1 up to $len, the number of
     * pieces returned must equal ceil($len / width). Nothing is printed
     * when the numbers agree.
     */
    for ($width = 1; $width <= $len; ++$width) {
        $ceil = ceil($len / $width);
        $cnt = count(mb_str_split($converted, $width, $cs));
        /* ceil() yields a float and count() an int, hence the loose comparison. */
        if ($ceil == $cnt) {
            continue;
        }
        echo "$cs WRONG CHUNKS NUMBER: expected/actual: $ceil/$cnt\n";
    }

    /*
     * Content: print the first $len one-character chunks as hex so they
     * can be matched against the expected output.
     */
    echo "$cs:";
    for ($pos = 0; $pos < $len; ++$pos) {
        $hex = unpack("H*", $singleChars[$pos])[1];
        echo " " . $hex;
    }
    echo "\n";
}
|
|
|
|
/*
 * Long string test: repeat the sample 50000 times, convert the result to
 * ISO-2022-JP and split it into chunks of $len characters.
 */
$repeats = 50000;
$encodedLong = mb_convert_encoding(
    str_repeat($string, $repeats), /* 50k x 2 chars = 1e5 chars */
    "ISO-2022-JP",
    "UTF-8"
);
$chunks = mb_str_split($encodedLong, $len, "ISO-2022-JP");
$chunkTotal = count($chunks);

/* The number of chunks must equal the number of repetitions. */
if ($repeats !== $chunkTotal) {
    printf("Long string splitting error: actual array size: %d expected: %d\n", $chunkTotal, $repeats);
}

/* The last chunk must be byte-identical to the sample encoded on its own. */
$expectedLast = mb_convert_encoding($string, "ISO-2022-JP", "UTF-8");
$actualLast = end($chunks);
if ($actualLast !== $expectedLast) {
    /* The format string below spans two source lines; the line break is part of the message. */
    printf("Long string splitting error:
last array element: %s expected: %s\n", unpack("H*", $actualLast)[1], unpack("H*", $expectedLast)[1]);
}
|
|
|
|
/* SJIS byte 0x80 was previously wrongly treated as the starting byte for a 2-byte character */
echo "== Regression test for SJIS byte 0x80 ==\n";

$sjisVariants = [
    'SJIS',
    'SJIS-2004',
    'MacJapanese',
    'SJIS-Mobile#DOCOMO',
    'SJIS-Mobile#KDDI',
    'SJIS-Mobile#SoftBank',
];

/* Raw byte strings to split: first the 0x80 case, then bytes 0xFD, 0xFE, and 0xFF. */
$rawInputs = [
    "\x80\xA1abc\x80\xA1",
    "abc\xFD\xFE\xFFab\xFD\xFE\xFF",
];

foreach ($sjisVariants as $encoding) {
    foreach ($rawInputs as $bytes) {
        /* Split into chunks of two characters and print each chunk as hex. */
        $hexChunks = [];
        foreach (mb_str_split($bytes, 2, $encoding) as $chunk) {
            $hexChunks[] = bin2hex($chunk);
        }
        echo "$encoding: [" . implode(', ', $hexChunks) . "]\n";
    }
}
|
|
|
|
?>
|
|
--EXPECT--
|
|
BIG-5: a4e9 a5bb
|
|
EUC-JP: c6fc cbdc
|
|
ISO-2022-JP: 1b2442467c1b2842 1b24424b5c1b2842
|
|
SJIS: 93fa 967b
|
|
UTF-16BE: 65e5 672c
|
|
UTF-16LE: e565 2c67
|
|
UTF-32BE: 000065e5 0000672c
|
|
UTF-32LE: e5650000 2c670000
|
|
UTF-8: e697a5 e69cac
|
|
== Regression test for SJIS byte 0x80 ==
|
|
SJIS: [80a1, 6162, 6380, a1]
|
|
SJIS: [6162, 63fd, feff, 6162, fdfe, ff]
|
|
SJIS-2004: [80a1, 6162, 6380, a1]
|
|
SJIS-2004: [6162, 63fd, feff, 6162, fdfe, ff]
|
|
MacJapanese: [80a1, 6162, 6380, a1]
|
|
MacJapanese: [6162, 63fd, feff, 6162, fdfe, ff]
|
|
SJIS-Mobile#DOCOMO: [80a1, 6162, 6380, a1]
|
|
SJIS-Mobile#DOCOMO: [6162, 63fd, feff, 6162, fdfe, ff]
|
|
SJIS-Mobile#KDDI: [80a1, 6162, 6380, a1]
|
|
SJIS-Mobile#KDDI: [6162, 63fd, feff, 6162, fdfe, ff]
|
|
SJIS-Mobile#SoftBank: [80a1, 6162, 6380, a1]
|
|
SJIS-Mobile#SoftBank: [6162, 63fd, feff, 6162, fdfe, ff]
|