mirror of
https://github.com/php/php-src.git
synced 2026-04-01 13:12:16 +02:00
This is a mix of more automated and manual migration. It should remove all applicable extension_loaded() checks outside of skipif.inc files.
331 lines
10 KiB
PHP
331 lines
10 KiB
PHP
--TEST--
|
|
Exhaustive test of ISO-2022-JP-2004 encoding verification and conversion
|
|
--EXTENSIONS--
|
|
mbstring
|
|
--SKIPIF--
|
|
<?php
|
|
if (getenv("SKIP_SLOW_TESTS")) die("skip slow test");
|
|
?>
|
|
--FILE--
|
|
<?php
|
|
srand(111); /* Make results consistent */
|
|
include('encoding_tests.inc');
|
|
mb_substitute_character(0x25); // '%'
|
|
|
|
/* Read in table of all characters in JISX-0208 charset */
|
|
$jisx0208Chars = array(); /* JISX0208 -> UTF-16BE */
|
|
$fp = fopen(__DIR__ . '/data/JISX0208.txt', 'r+');
|
|
while ($line = fgets($fp, 256)) {
|
|
if ($line[0] == '#')
|
|
continue;
|
|
|
|
if (sscanf($line, "0x%x\t0x%x\t0x%x", $shiftJIS, $jis0208Code, $unicodeCP) == 3) {
|
|
$jisx0208Chars[pack('n', $jis0208Code)] = pack('n', $unicodeCP);
|
|
}
|
|
}
|
|
|
|
/* The JIS X 0208 character set does not have a single, straightforward
|
|
* mapping to the Unicode character set
|
|
* mbstring converts one character differently from the mappings in
|
|
* data/JISX0208.txt, which comes from the Unicode Consortium */
|
|
|
|
/* 0x2140 is a backslash; this can be mapped to 0x005C for an ordinary
|
|
* backslash, or 0xFF3C for a _fullwidth_ one */
|
|
$jisx0208Chars["\x21\x40"] = "\xFF\x3C";
|
|
|
|
/* Single bytes from 0x0-0x20 are allowed */
|
|
for ($i = 0; $i <= 0x20; $i++) {
|
|
if ($i != 0x1B)
|
|
$jisx0208Chars[chr($i)] = "\x00" . chr($i);
|
|
}
|
|
/* As is 0x7F */
|
|
$jisx0208Chars["\x7F"] = "\x00\x7F";
|
|
|
|
/* Now read table of JISX-0213:2004 plane 1 and JISX-0213:2000 plane 2 chars */
|
|
$jisx0213_2004_1Chars = array();
|
|
$jisx0213_2000_2Chars = array();
|
|
$fp = fopen(__DIR__ . '/data/ISO-2022-JP-2004-JISX0213.txt', 'r+');
|
|
while ($line = fgets($fp, 256)) {
|
|
if ($line[0] == '#')
|
|
continue;
|
|
|
|
$cp2 = null;
|
|
if (sscanf($line, "%d-%x\tU+%x+%x", $type, $bytes, $cp1, $cp2) >= 3) {
|
|
if ($cp1 <= 0xFFFF)
|
|
$unicode = pack('n', $cp1);
|
|
else
|
|
$unicode = mb_convert_encoding(pack('N', $cp1), 'UTF-16BE', 'UTF-32BE');
|
|
if ($cp2)
|
|
$unicode .= pack('n', $cp2);
|
|
|
|
if ($type == 3)
|
|
$jisx0213_2004_1Chars[pack('n', $bytes)] = $unicode;
|
|
else if ($type == 4)
|
|
$jisx0213_2000_2Chars[pack('n', $bytes)] = $unicode;
|
|
}
|
|
}
|
|
|
|
/* JISX 0213 plane 1 0x2131 is an overline; Unicode has a halfwidth overline
|
|
* at 0x203E and a fullwidth overline at 0xFFE3
|
|
* We'll use the fullwidth version when converting JISX 0213 to Unicode */
|
|
$jisx0213_2004_1Chars["\x21\x31"] = "\xFF\xE3";
|
|
/* Same deal with the Yen sign; use the fullwidth one */
|
|
$jisx0213_2004_1Chars["\x21\x6F"] = "\xFF\xE5";
|
|
|
|
/* Since JISX 0213 is an extension of JISX 0208, allow the same single-byte chars */
|
|
for ($i = 0; $i <= 0x20; $i++) {
|
|
if ($i != 0x1B)
|
|
$jisx0213_2004_1Chars[chr($i)] = "\x00" . chr($i);
|
|
}
|
|
$jisx0213_2004_1Chars["\x7F"] = "\x00\x7F";
|
|
|
|
for ($i = 0; $i <= 0x20; $i++) {
|
|
if ($i != 0x1B)
|
|
$jisx0213_2000_2Chars[chr($i)] = "\x00" . chr($i);
|
|
}
|
|
$jisx0213_2000_2Chars["\x7F"] = "\x00\x7F";
|
|
|
|
function testValid($from, $to, $bothWays = true) {
|
|
identifyValidString($from, 'ISO-2022-JP-2004');
|
|
convertValidString($from, $to, 'ISO-2022-JP-2004', 'UTF-16BE', false);
|
|
|
|
if ($bothWays) {
|
|
/* Try going in the opposite direction too
|
|
* ESC ( B at the beginning of ISO-2022-JP-2004 string is redundant,
|
|
* since ASCII mode is the default */
|
|
if (substr($from, 0, 3) == "\x1B(B")
|
|
$from = substr($from, 3, strlen($from) - 3);
|
|
/* If the ISO-2022-JP-2004 string switches to a different charset, it
|
|
* should switch back to ASCII at the end */
|
|
if (strpos($from, "\x1B\$B") !== false || strpos($from, "\x1B\$(Q") !== false || strpos($from, "\x1B\$(P") !== false)
|
|
$from .= "\x1B(B";
|
|
|
|
convertValidString($to, $from, 'UTF-16BE', 'ISO-2022-JP-2004', false);
|
|
}
|
|
}
|
|
|
|
function testInvalid($from, $to) {
|
|
testInvalidString($from, $to, 'ISO-2022-JP-2004', 'UTF-16BE');
|
|
}
|
|
|
|
/* Try all ASCII characters */
|
|
for ($i = 0; $i <= 0x7F; $i++) {
|
|
if ($i == 0x1B)
|
|
continue;
|
|
testValid(chr($i), "\x00" . chr($i));
|
|
}
|
|
|
|
/* Try all ASCII characters, with explicit ASCII escape */
|
|
for ($i = 0; $i <= 0x7F; $i++) {
|
|
if ($i == 0x1B)
|
|
continue;
|
|
testValid("\x1B(B" . chr($i), "\x00" . chr($i));
|
|
}
|
|
|
|
echo "Encoding verification and conversion works for all ASCII characters\n";
|
|
|
|
/* Try a bare ESC */
|
|
identifyInvalidString("\x1B", 'ISO-2022-JP-2004');
|
|
|
|
/* Try all non-ASCII, non-ESC single bytes */
|
|
for ($i = 0x80; $i <= 0xFF; $i++) {
|
|
testInvalid(chr($i), "\x00%");
|
|
}
|
|
|
|
echo "Encoding verification and conversion rejects all invalid single bytes\n";
|
|
|
|
/* All valid JISX0208 characters */
|
|
foreach ($jisx0208Chars as $jisx0208 => $utf16BE) {
|
|
/* Since JIS X 0213 charset is a superset of JIS X 0208, we don't bother
|
|
* using JIS X 0208 when converting Unicode to ISO-2022-JP-2004
|
|
* Therefore, don't test conversion in both directions here */
|
|
testValid("\x1B\$B" . $jisx0208, $utf16BE, false);
|
|
}
|
|
|
|
/* All invalid 1-byte JISX0208 characters */
|
|
for ($i = 0; $i < 256; $i++) {
|
|
if ($i == 0x1B)
|
|
continue;
|
|
if ($i >= 0x21 && $i <= 0x7E)
|
|
continue;
|
|
$testString = chr($i);
|
|
if (!isset($jisx0208Chars[$testString])) {
|
|
testInvalid("\x1B\$B" . $testString, "\x00%");
|
|
}
|
|
}
|
|
|
|
/* All invalid 2-byte JISX0208 characters */
|
|
for ($i = 0x21; $i <= 0x7E; $i++) {
|
|
for ($j = 0; $j < 256; $j++) {
|
|
$testString = chr($i) . chr($j);
|
|
if (!isset($jisx0208Chars[$testString])) {
|
|
testInvalid("\x1B\$B" . $testString, "\x00%");
|
|
}
|
|
}
|
|
}
|
|
|
|
echo "Encoding verification and conversion work on JISX-0208 characters\n";
|
|
|
|
/* All JISX0213 plane 1 characters */
|
|
foreach ($jisx0213_2004_1Chars as $jisx0213_2004 => $utf16BE) {
|
|
/* For single bytes, don't try conversion in both directions */
|
|
testValid("\x1B$(Q" . $jisx0213_2004, $utf16BE, $utf16BE > "\x01\x00");
|
|
}
|
|
|
|
/* All invalid 2-byte JISX0213 plane 1 characters */
|
|
for ($i = 0x21; $i <= 0x7E; $i++) {
|
|
for ($j = 0; $j < 256; $j++) {
|
|
$testString = chr($i) . chr($j);
|
|
if (!isset($jisx0213_2004_1Chars[$testString])) {
|
|
testInvalid("\x1B$(Q" . $testString, "\x00%");
|
|
}
|
|
}
|
|
}
|
|
|
|
echo "Encoding verification and conversion work on JISX-0213:2004 plane 1 characters\n";
|
|
|
|
/* All JISX0213 plane 2 characters */
|
|
foreach ($jisx0213_2000_2Chars as $jisx0213_2000 => $utf16BE) {
|
|
/* For single bytes, don't try conversion in both directions */
|
|
testValid("\x1B$(P" . $jisx0213_2000, $utf16BE, $utf16BE > "\x01\x00");
|
|
}
|
|
|
|
/* All invalid 2-byte JISX0213 plane 2 characters */
|
|
for ($i = 0x21; $i <= 0x7E; $i++) {
|
|
for ($j = 0; $j < 256; $j++) {
|
|
$testString = chr($i) . chr($j);
|
|
if (!isset($jisx0213_2000_2Chars[$testString])) {
|
|
testInvalid("\x1B$(P" . $testString, "\x00%");
|
|
}
|
|
}
|
|
}
|
|
|
|
echo "Encoding verification and conversion work on JISX-0213:2000 plane 2 characters\n";
|
|
|
|
/* All possible escape sequences */
|
|
$validEscapes = ["\x1B\$B" => true, "\x1B(B" => true, "\x1B$(Q" => true, "\x1B$(P" => true];
|
|
for ($i = 0; $i <= 0xFF; $i++) {
|
|
for ($j = 0; $j <= 0xFF; $j++) {
|
|
$escapeSequence = "\x1B" . chr($i) . chr($j);
|
|
if (isset($validEscapes[$escapeSequence])) {
|
|
testValid($escapeSequence, "", false);
|
|
} else {
|
|
identifyInvalidString($escapeSequence, 'ISO-2022-JP-2004');
|
|
}
|
|
}
|
|
}
|
|
|
|
echo "All escape sequences work as expected\n";
|
|
|
|
identifyInvalidString("\x1B$", 'ISO-2022-JP-2004');
|
|
identifyInvalidString("\x1B(", 'ISO-2022-JP-2004');
|
|
identifyInvalidString("\x1B$(", 'ISO-2022-JP-2004');
|
|
|
|
echo "All incomplete escape sequences are rejected\n";
|
|
|
|
/* Try all combinations of 2 different charsets in the same string */
|
|
$ascii = "\x1B(Ba";
|
|
$jisx0208 = "\x1B\$B" . array_keys($jisx0208Chars)[rand(0,1000)];
|
|
$jisx0213_1 = "\x1B$(Q" . array_keys($jisx0213_2004_1Chars)[rand(0,1000)];
|
|
$jisx0213_2 = "\x1B$(P" . array_keys($jisx0213_2000_2Chars)[rand(0,1000)];
|
|
$differentCharsets = [$ascii, $jisx0208, $jisx0213_1, $jisx0213_2];
|
|
foreach ($differentCharsets as $a) {
|
|
foreach ($differentCharsets as $b) {
|
|
identifyValidString($a . $b, 'ISO-2022-JP-2004');
|
|
}
|
|
}
|
|
|
|
/* Try redundant escape sequences (switching mode but including any characters
|
|
* in the new mode) */
|
|
$ascii_Esc = "\x1B(B";
|
|
$jisx0208_Esc = "\x1B\$B";
|
|
$jisx0213_1_Esc = "\x1B$(Q";
|
|
$jisx0213_2_Esc = "\x1B$(P";
|
|
$differentCharsets = [$ascii_Esc, $jisx0208_Esc, $jisx0213_1_Esc, $jisx0213_2_Esc];
|
|
foreach ($differentCharsets as $a) {
|
|
foreach ($differentCharsets as $b) {
|
|
testValid($a . $b, "", false);
|
|
}
|
|
}
|
|
|
|
echo "Combining multiple charsets in the same string works as expected\n";
|
|
|
|
/* Try ending in the middle of a JISX0208 character */
|
|
testInvalid(substr($jisx0208, 0, strlen($jisx0208) - 1), "\x00%");
|
|
|
|
/* Try ending in the middle of a JISX0213 plane 1 character */
|
|
testInvalid(substr($jisx0213_1, 0, strlen($jisx0213_1) - 1), "\x00%");
|
|
|
|
/* Try ending in the middle of a JISX0213 plane 2 character */
|
|
testInvalid(substr($jisx0213_2, 0, strlen($jisx0213_2) - 1), "\x00%");
|
|
|
|
echo "Strings with truncated multi-byte characters are rejected\n";
|
|
|
|
/* We have tried converting all kinds of strings with single characters;
|
|
* now try some random examples of strings with multiple characters */
|
|
$jisx0208 = array_keys($jisx0208Chars);
|
|
shuffle($jisx0208);
|
|
$jisx0213_1 = array_keys($jisx0213_2004_1Chars);
|
|
shuffle($jisx0213_1);
|
|
$jisx0213_2 = array_keys($jisx0213_2000_2Chars);
|
|
shuffle($jisx0213_2);
|
|
|
|
for ($i = 0; $i < 100; $i++) {
|
|
$size = rand(5,20);
|
|
$testString = '';
|
|
$convertsTo = '';
|
|
|
|
/* Build a string from a random combination of characters in the supported
|
|
* character sets */
|
|
while ($size--) {
|
|
$type = rand(0,4);
|
|
$chars = rand(0,10);
|
|
if ($type == 0) { /* ASCII */
|
|
$testString .= "\x1B(B";
|
|
while ($chars--) {
|
|
$ascii = chr(rand(0x20, 0x7E));
|
|
$testString .= $ascii;
|
|
$convertsTo .= "\x00" . $ascii;
|
|
}
|
|
} else if ($type == 1) { /* JIS X 0208 */
|
|
$testString .= "\x1B\$B";
|
|
while ($chars--) {
|
|
$jis = array_pop($jisx0208);
|
|
$testString .= $jis;
|
|
$convertsTo .= $jisx0208Chars[$jis];
|
|
}
|
|
} else if ($type == 2) { /* JIS X 0213:2004 plane 1 */
|
|
$testString .= "\x1B$(Q";
|
|
while ($chars--) {
|
|
$jis = array_pop($jisx0213_1);
|
|
$testString .= $jis;
|
|
$convertsTo .= $jisx0213_2004_1Chars[$jis];
|
|
}
|
|
} else { /* JIS X 0213:2000 plane 2 */
|
|
$testString .= "\x1B$(P";
|
|
while ($chars-- && !empty($jisx0213_2)) {
|
|
$jis = array_pop($jisx0213_2);
|
|
$testString .= $jis;
|
|
$convertsTo .= $jisx0213_2000_2Chars[$jis];
|
|
}
|
|
}
|
|
}
|
|
|
|
testValid($testString, $convertsTo, false);
|
|
}
|
|
|
|
echo "All done!\n";
|
|
|
|
?>
|
|
--EXPECT--
|
|
Encoding verification and conversion works for all ASCII characters
|
|
Encoding verification and conversion rejects all invalid single bytes
|
|
Encoding verification and conversion work on JISX-0208 characters
|
|
Encoding verification and conversion work on JISX-0213:2004 plane 1 characters
|
|
Encoding verification and conversion work on JISX-0213:2000 plane 2 characters
|
|
All escape sequences work as expected
|
|
All incomplete escape sequences are rejected
|
|
Combining multiple charsets in the same string works as expected
|
|
Strings with truncated multi-byte characters are rejected
|
|
All done!
|