1
0
mirror of https://github.com/php/php-src.git synced 2026-04-02 13:43:02 +02:00
Files
archived-php-src/ext/mbstring/tests/hz_encoding.phpt
Alex Dowad 1e5c3c13fd Fix conversion of HZ text (and add test suite)
- Treat truncated multi-byte characters as an error.
- Don't allow ASCII control characters to appear in the middle of a
  multi-byte character.
- Handle ~ escapes according to the HZ standard (RFC 1843).
- Treat unrecognized ~ escapes as an error.
- Multi-byte characters (between ~{ ~} escapes) are GB2312, not CP936.
  (CP936 is an extended version from Microsoft, but the RFC does not
  state that this extended version of GB should be used.)
2021-06-29 12:25:21 +02:00

130 lines
3.9 KiB
PHP

--TEST--
Exhaustive test of verification and conversion of HZ text
--EXTENSIONS--
mbstring
--SKIPIF--
<?php
// Skip this test whenever the SKIP_SLOW_TESTS environment variable holds a
// truthy value; the message printed by die() starts with "skip".
if (getenv('SKIP_SLOW_TESTS')) {
    die('skip slow test');
}
?>
--FILE--
<?php
// The test*/convert*/find* helpers called below are not defined in this script;
// presumably they come from this include (confirm in encoding_tests.inc).
include 'encoding_tests.inc';

// Fixed seed so that the shuffle() calls further down are repeatable between runs
srand(1000);

// Invalid input is replaced by '%' (code point 0x25) in converted output
mb_substitute_character(ord('%'));
// Every 7-bit byte is expected to convert from ASCII to HZ unchanged.
// '~' (0x7E) is left out here because it has its own dedicated escape checks.
foreach (range(0x00, 0x7F) as $byte) {
    if ($byte === 0x7E) {
        continue;
    }
    $char = chr($byte);
    testValidString($char, $char, 'ASCII', 'HZ');
}
echo "Tested ASCII -> HZ\n";
// Every 7-bit byte is expected to convert from HZ to ASCII unchanged.
// '~' (0x7E) is left out here because it has its own dedicated escape checks.
foreach (range(0x00, 0x7F) as $byte) {
    if ($byte === 0x7E) {
        continue;
    }
    $char = chr($byte);
    testValidString($char, $char, 'HZ', 'ASCII');
}
echo "Tested HZ -> ASCII\n";
// Outside of a ~{ ~} section, every byte with the high bit set is expected to be
// rejected and replaced by the substitute character '%'.
// The upper bound is inclusive so that 0xFF is exercised as well; the previous
// `$i < 0xFF` bound stopped at 0xFE and silently left the last byte untested.
for ($i = 0x80; $i <= 0xFF; $i++) {
    testInvalidString(chr($i), '%', 'HZ', 'ASCII');
}
echo "Tested non-ASCII bytes in ASCII mode\n";
// Argument lists for testValidString(), one per valid '~' escape sequence.
// The trailing `false` presumably switches off the reverse (round-trip) check,
// since the same position receives $reversible later in this script
// (TODO confirm against encoding_tests.inc).
$validEscapes = [
    ['~~', '~', 'HZ', 'ASCII'],            // "~~" decodes to a single literal '~'
    ["~\n", '', 'HZ', 'ASCII', false],     // '~' followed by a newline yields no output
    ['~{~}', '', 'HZ', 'ASCII', false],    // an empty ~{ ~} section yields no output
    ["~{~\n~}", '', 'HZ', 'ASCII', false], // "~\n" inside ~{ ~} yields no output either
];
foreach ($validEscapes as $arguments) {
    testValidString(...$arguments);
}
echo "Tested valid ~ escapes\n";
// Put every byte except "\n" after a '~' and expect the error marker '%' whenever
// the resulting pair is not a valid escape for the current mode.
// The bound is inclusive so that 0xFF is covered as well; the earlier `$i < 0xFF`
// bound stopped at 0xFE even though the summary line below says "all".
for ($i = 0x00; $i <= 0xFF; $i++) {
    if ($i == 0x0A) {
        // "~\n" is accepted in both modes (see the valid-escape checks), so skip it
        continue;
    }
    $byte = chr($i);

    // ASCII mode: "~~" and "~{" are valid, every other pair must be rejected
    if ($i != 0x7E && $i != 0x7B) {
        testInvalidString("~" . $byte, '%', 'HZ', 'ASCII');
    }

    // GB mode (inside ~{ ... ~}): only "~}" is left out of the invalid set
    if ($i != 0x7D) {
        testInvalidString("~{~" . $byte . "~}", '%', 'HZ', 'ASCII');
    }
}
echo "Tested all invalid ~ escapes\n";
// Read the GB2312 mapping file into $toUnicode / $fromUnicode, then let
// findInvalidChars() fill $invalid and $truncated from the valid set.
readConversionTable(__DIR__ . '/data/GB2312.txt', $toUnicode, $fromUnicode);
findInvalidChars($toUnicode, $invalid, $truncated);

// "\x21\x2C" is flagged as not reversible: according to the original note, two
// characters convert to Unicode 0x2225, so the way back cannot reproduce both.
$irreversible = ["\x21\x2C" => true];

// Convert every mapped GB2312 character inside ~{ ~} escapes, in shuffled order
// and one character per string.
$goodChars = array_keys($toUnicode);
shuffle($goodChars);
while (count($goodChars) > 0) {
    $goodChar = array_pop($goodChars);
    $fromString = '~{' . $goodChar . '~}';
    $toString = $toUnicode[$goodChar];
    $reversible = !isset($irreversible[$goodChar]);
    testValidString($fromString, $toString, 'HZ', 'UTF-16BE', $reversible);
}
// Every invalid GB2312 sequence, placed inside ~{ ~} and followed by one valid
// character, must produce the error marker followed by that character's
// UTF-16BE form. "~" is dropped from the invalid set because it introduces an
// escape sequence, and escapes have their own dedicated checks in this test.
unset($invalid["~"]);
$badChars = array_keys($invalid);
$goodChars = [];
while ($badChars !== []) {
    // Refill and reshuffle the pool of valid characters whenever it runs dry
    if ($goodChars === []) {
        $goodChars = array_keys($toUnicode);
        shuffle($goodChars);
    }
    $goodChar = array_pop($goodChars);
    $badChar = array_pop($badChars);
    testInvalidString(
        '~{' . $badChar . $goodChar . '~}',
        "\x00%" . $toUnicode[$goodChar],
        'HZ',
        'UTF-16BE'
    );
}
// A ~{ section that ends on a truncated character must yield a single error
// marker ('%' encoded as UTF-16BE).
foreach (array_keys($truncated) as $truncatedChar) {
    testInvalidString('~{' . $truncatedChar, "\x00%", 'HZ', 'UTF-16BE');
}
echo "Tested HZ -> UTF-16BE (for all GB2312 characters)\n";
// Recompute $invalid for the Unicode -> GB2312 direction; the last argument maps
// every byte value 0x00-0xFF to the number 2.
findInvalidChars($fromUnicode, $invalid, $unused, array_fill_keys(range(0, 0xFF), 2));

// ASCII code points are absent from the Unicode -> GB2312 map, yet they *are*
// valid to convert to HZ, so take them out of the invalid set.
foreach (range(0x00, 0x7F) as $ascii) {
    unset($invalid["\x00" . chr($ascii)]);
}

// Each invalid UTF-16BE unit, followed by one convertible character, must come
// out as '%' followed by that character wrapped in ~{ ~}.
$badChars = array_keys($invalid);
$goodChars = [];
while ($badChars !== []) {
    // Refill and reshuffle the pool of convertible characters whenever it runs dry
    if ($goodChars === []) {
        $goodChars = array_keys($fromUnicode);
        shuffle($goodChars);
    }
    $goodChar = array_pop($goodChars);
    $badChar = array_pop($badChars);
    convertInvalidString(
        $badChar . $goodChar,
        "%~{" . $fromUnicode[$goodChar] . "~}",
        'UTF-16BE',
        'HZ'
    );
}
echo "Tested UTF-16BE -> HZ (for all GB2312 characters)\n";
?>
--EXPECT--
Tested ASCII -> HZ
Tested HZ -> ASCII
Tested non-ASCII bytes in ASCII mode
Tested valid ~ escapes
Tested all invalid ~ escapes
Tested HZ -> UTF-16BE (for all GB2312 characters)
Tested UTF-16BE -> HZ (for all GB2312 characters)