1
0
mirror of https://github.com/php/php-src.git synced 2026-04-25 08:58:28 +02:00
Files
archived-php-src/ext/mbstring/tests/encoding_tests.inc
T
Alex Dowad 831abe2d90 Add test suite for CP1252 encoding
Also remove a bogus test (bug62545.phpt) which wrongly assumed that all invalid
characters in CP1251 and CP1252 should map to Unicode 0xFFFD (REPLACEMENT
CHARACTER).

mbstring has an interface to specify what invalid characters should be
replaced with; it's called `mb_substitute_character`. If a user wants to see
the Unicode 'replacement character', they can specify that using
`mb_substitute_character`. But if they specify something else, we should
follow that.
2020-10-30 22:13:27 +02:00

202 lines
7.4 KiB
PHP

<?php
// Common code for tests which focus on conversion and verification of text
// in some specific encoding
// Read a conversion table file with one character and its equivalent Unicode
// codepoint on each line, written as two tab-delimited hex numbers
// (`0x8140<TAB>0x3000`). Lines starting with '#' and lines which do not match
// that format are skipped.
//
// `$path`  is the table file to read
// `$from`  (output) maps each character's bytes to its encoded codepoint
// `$to`    (output) is the reverse map, from encoded codepoint to character bytes
// `$utf32` selects how codepoints are packed: 4 bytes big-endian if true,
//          otherwise 2 bytes big-endian
//
// Terminates the script with a message if the file cannot be opened.
function readConversionTable($path, &$from, &$to, $utf32 = false) {
    $from = array();
    $to = array();

    // The table is only ever read; asking for 'r+' would needlessly fail on
    // read-only fixture files
    $fp = fopen($path, 'r');
    if ($fp === false)
        die("Unable to open conversion table: $path");

    // Compare against false explicitly, so that a falsy line (such as a final
    // "0" with no newline) does not end the loop early
    while (($line = fgets($fp, 256)) !== false) {
        if ($line[0] === '#')
            continue;
        if (sscanf($line, "0x%x\t0x%x", $char, $codepoint) === 2) {
            $codepoint = $utf32 ? pack('N', $codepoint) : pack('n', $codepoint);
            if ($char <= 0xFF)
                $char = chr($char); // hex codes must not have leading zero bytes
            else if ($char <= 0xFFFF)
                $char = pack('n', $char);
            else if ($char <= 0xFFFFFF)
                $char = chr($char >> 16) . pack('n', $char & 0xFFFF);
            else
                $char = pack('N', $char);
            $from[$char] = $codepoint;
            $to[$codepoint] = $char;
        }
    }

    fclose($fp);
}
// Render a string for failure messages: always as hex in parentheses, and
// additionally as a quoted literal in front when the string is pure ASCII
function dbgPrint($str) {
    $quoted = mb_check_encoding($str, 'ASCII') ? '"' . $str . '" ' : '';
    return $quoted . "(" . bin2hex($str) . ")";
}
// Abort the script unless `mb_check_encoding` accepts `$goodString` as valid
// text in `$encoding`
function identifyValidString($goodString, $encoding) {
    if (mb_check_encoding($goodString, $encoding))
        return;

    die("mb_check_encoding failed on good $encoding string: " . dbgPrint($goodString));
}
// Abort the script if `mb_check_encoding` accepts `$badString` as valid text
// in `$encoding`
function identifyInvalidString($badString, $encoding) {
    if (!mb_check_encoding($badString, $encoding))
        return;

    die("mb_check_encoding passed on bad $encoding string: " . dbgPrint($badString));
}
// Convert `$fromString` from `$fromEncoding` to `$toEncoding` and abort the
// script, reporting input, expected and actual output, unless the result is
// exactly `$toString`
function testConversion($fromString, $toString, $fromEncoding, $toEncoding) {
    $actual = mb_convert_encoding($fromString, $toEncoding, $fromEncoding);
    if ($actual === $toString)
        return;

    die("mb_convert_encoding not working on $fromEncoding input: " . dbgPrint($fromString) . "\nExpected $toEncoding: " . dbgPrint($toString) . "\nActually got: " . dbgPrint($actual));
}
// Check a conversion of valid input: the output must match `$toString`, and
// mbstring's 'illegal_chars' counter must be unchanged afterwards
function testValidConversion($fromString, $toString, $fromEncoding, $toEncoding) {
    $countBefore = mb_get_info('illegal_chars');
    testConversion($fromString, $toString, $fromEncoding, $toEncoding);
    $countAfter = mb_get_info('illegal_chars');

    if ($countAfter === $countBefore)
        return;

    die("mb_convert_encoding incremented illegal_chars on valid $fromEncoding string: " . dbgPrint($fromString) . " when converting to $toEncoding");
}
// Check that valid `$fromString` converts to `$toString`; unless `$bothWays`
// is false, also check the reverse conversion back to `$fromString`
function convertValidString($fromString, $toString, $fromEncoding, $toEncoding, $bothWays = true) {
    testValidConversion($fromString, $toString, $fromEncoding, $toEncoding);

    if (!$bothWays)
        return;

    testValidConversion($toString, $fromString, $toEncoding, $fromEncoding);
}
// Check a conversion of invalid input: the output must match `$toString`, and
// mbstring's 'illegal_chars' counter must have grown afterwards
function convertInvalidString($fromString, $toString, $fromEncoding, $toEncoding) {
    $countBefore = mb_get_info('illegal_chars');
    testConversion($fromString, $toString, $fromEncoding, $toEncoding);
    $countAfter = mb_get_info('illegal_chars');

    if ($countAfter <= $countBefore) {
        die("mb_convert_encoding did not increment illegal_chars on invalid $fromEncoding string: " . dbgPrint($fromString) . " when converting to $toEncoding");
    }
}
// Full check for a valid string: first it must be recognized as valid in
// `$fromEncoding`, then it must convert correctly (in both directions unless
// `$bothWays` is false)
function testValidString($fromString, $toString, $fromEncoding, $toEncoding, $bothWays = true) {
    identifyValidString(
        $fromString,
        $fromEncoding
    );
    convertValidString(
        $fromString,
        $toString,
        $fromEncoding,
        $toEncoding,
        $bothWays
    );
}
// Full check for an invalid string: first it must be recognized as invalid in
// `$fromEncoding`, then converting it must yield `$toString` and bump the
// 'illegal_chars' counter
function testInvalidString($fromString, $toString, $fromEncoding, $toEncoding) {
    identifyInvalidString(
        $fromString,
        $fromEncoding
    );
    convertInvalidString(
        $fromString,
        $toString,
        $fromEncoding,
        $toEncoding
    );
}
// Test every character in `$charMap` (source bytes => expected converted
// bytes) by shuffling them and gluing them into random strings of up to 10
// characters, each of which is run through `testValidString`
//
// Only for encodings where valid characters can be concatenated together in any
// way, without any escape sequences
function testAllValidChars($charMap, $fromEncoding, $toEncoding, $bothWays = true) {
    $pending = array_keys($charMap);
    shuffle($pending);

    while (count($pending) > 0) {
        $take = min(rand(5,10), count($pending));
        $source = '';
        $expected = '';

        for ($taken = 0; $taken < $take; $taken++) {
            $char = array_pop($pending);
            $source .= $char;
            $expected .= $charMap[$char];
        }

        testValidString($source, $expected, $fromEncoding, $toEncoding, $bothWays);
    }
}
// Test every key of `$badChars` as an invalid sequence. Each one is followed
// by a valid character drawn from a shuffled pool of `$charMap` keys (refilled
// whenever it runs dry), and the conversion is expected to produce
// `$replacement` followed by the valid character's converted form
function testAllInvalidChars($badChars, $charMap, $fromEncoding, $toEncoding, $replacement) {
    $pendingBad = array_keys($badChars);
    $goodPool = array();

    while (count($pendingBad) > 0) {
        if (count($goodPool) === 0) {
            $goodPool = array_keys($charMap);
            shuffle($goodPool);
        }

        $good = array_pop($goodPool);
        $bad = array_pop($pendingBad);

        testInvalidString(
            $bad . $good,
            $replacement . $charMap[$good],
            $fromEncoding,
            $toEncoding
        );
    }
}
// Test every key of `$truncated` as an invalid string on its own; each one is
// expected to convert to just `$replacement`
function testTruncatedChars($truncated, $fromEncoding, $toEncoding, $replacement) {
    foreach ($truncated as $truncatedChar => $unused) {
        testInvalidString($truncatedChar, $replacement, $fromEncoding, $toEncoding);
    }
}
// For variable-width encodings, where we have an exhaustive list of
// all valid characters of any width
//
// `$valid` has the valid characters (as byte strings) for keys
// `$invalid` (output) gets a key for each byte sequence found which is neither
// a valid character nor the start of one
// `$truncated` (output) gets a key for each byte sequence which is not a
// complete character but can start one
//
// `$startBytes` maps from first-byte values to the corresponding character length
// (For encodings where the first byte can tell you the length of a multi-byte
// character)
// Note that `$startBytes` can be partial! First bytes which are missing from
// it are explored using the prefixes of the characters in `$valid` instead
function findInvalidChars($valid, &$invalid, &$truncated, $startBytes = array()) {
    $invalid = array();
    $truncated = array();
    $prefixes = array(); /* Every proper prefix of a valid character */

    foreach ($valid as $char => $unicode) {
        // PHP converts numeric-string array keys (like "7") to integers
        $char = (string)$char;
        for ($len = 1; $len < strlen($char); $len++)
            $prefixes[substr($char, 0, $len)] = true;
    }

    // Classify `$str` when its length is not known up front; if it is the
    // start of a longer valid character, try every possible following byte.
    // The output arrays and the closure itself must be captured by reference,
    // or results would be lost and the recursive call would not resolve
    $varLength = function($str) use ($valid, $prefixes, &$invalid, &$truncated, &$varLength) {
        if (isset($valid[$str]))
            return;

        if (isset($prefixes[$str])) {
            $truncated[$str] = true;
            for ($byte = 0; $byte < 256; $byte++)
                $varLength($str . chr($byte));
        } else {
            $invalid[$str] = true;
        }
    };

    // Classify `$prefix` when exactly `$remaining` more bytes are needed to
    // complete the character: anything shorter than the full length is
    // truncated, and a full-length sequence is invalid unless listed in `$valid`
    $fixedLength = function($prefix, $remaining) use ($valid, &$invalid, &$truncated, &$fixedLength) {
        if ($remaining <= 0) {
            if (!isset($valid[$prefix]))
                $invalid[$prefix] = true;
            return;
        }

        $truncated[$prefix] = true;
        for ($i = 0; $i < 256; $i++)
            $fixedLength($prefix . chr($i), $remaining - 1);
    };

    for ($byte = 0; $byte < 256; $byte++) {
        if (isset($startBytes[$byte]))
            $fixedLength(chr($byte), $startBytes[$byte] - 1);
        else
            $varLength(chr($byte));
    }
}
// Helper for building the `$startBytes` map taken by `findInvalidChars`:
// returns `$array` with every entry of `$keys` set to `$value` (existing
// entries under the same key are overwritten)
function map($keys, $value, $array = array()) {
    foreach ($keys as $k) {
        $array[$k] = $value;
    }

    return $array;
}
?>