mirror of
https://github.com/php/php-src.git
synced 2026-04-25 08:58:28 +02:00
831abe2d90
Also remove a bogus test (bug62545.phpt) which wrongly assumed that all invalid characters in CP1251 and CP1252 should map to Unicode 0xFFFD (REPLACEMENT CHARACTER). mbstring has an interface to specify what invalid characters should be replaced with; it's called `mb_substitute_character`. If a user wants to see the Unicode 'replacement character', they can specify that using `mb_substitute_character`. But if they specify something else, we should follow that.
202 lines
7.4 KiB
PHP
202 lines
7.4 KiB
PHP
<?php
|
|
|
|
// Common code for tests which focus on conversion and verification of text
|
|
// in some specific encoding
|
|
|
|
// Read a file with one character and its equivalent Unicode codepoint on each
// line, delimited by tabs
//
// Each data line must look like "0x<char hex>\t0x<codepoint hex>". Lines which
// start with '#' and lines which do not match that format are skipped.
//
// $path  - path of the conversion table file
// $from  - (out) map from encoded character (byte string) to packed codepoint
// $to    - (out) map from packed codepoint back to encoded character
// $utf32 - if true, codepoints are packed with pack('N') (32-bit big-endian),
//          otherwise with pack('n') (16-bit big-endian)
function readConversionTable($path, &$from, &$to, $utf32 = false) {
  $from = array();
  $to   = array();

  // The table is only ever read; opening it read-write ('r+') would fail
  // needlessly on a read-only file
  $fp = fopen($path, 'r');
  if ($fp === false)
    die("Could not open conversion table: $path");

  // Compare against false explicitly so that a falsy line (such as a final
  // "0" with no trailing newline) cannot be mistaken for end-of-file
  while (($line = fgets($fp, 256)) !== false) {
    if ($line[0] == '#')
      continue;
    if (sscanf($line, "0x%x\t0x%x", $char, $codepoint) == 2) {
      $codepoint = $utf32 ? pack('N', $codepoint) : pack('n', $codepoint);
      // Emit the character using the smallest number of bytes which can hold
      // its hex code
      if ($char <= 0xFF)
        $char = chr($char); // hex codes must not have leading zero bytes
      else if ($char <= 0xFFFF)
        $char = pack('n', $char);
      else if ($char <= 0xFFFFFF)
        $char = chr($char >> 16) . pack('n', $char & 0xFFFF);
      else
        $char = pack('N', $char);
      $from[$char] = $codepoint;
      $to[$codepoint] = $char;
    }
  }

  // Release the handle instead of leaking it until the script ends
  fclose($fp);
}
|
|
|
|
// Format a string for diagnostic messages: the hex dump of its bytes in
// parentheses, preceded by the quoted string itself when mb_check_encoding
// accepts it as ASCII
function dbgPrint($str) {
  $hex = '(' . bin2hex($str) . ')';
  if (!mb_check_encoding($str, 'ASCII'))
    return $hex;
  return '"' . $str . '" ' . $hex;
}
|
|
|
|
// Abort the script unless mb_check_encoding accepts `$goodString` as valid
// text in `$encoding`
function identifyValidString($goodString, $encoding) {
  if (mb_check_encoding($goodString, $encoding))
    return;
  die("mb_check_encoding failed on good $encoding string: " . dbgPrint($goodString));
}
|
|
|
|
// Abort the script if mb_check_encoding accepts `$badString` as valid text in
// `$encoding`
function identifyInvalidString($badString, $encoding) {
  if (!mb_check_encoding($badString, $encoding))
    return;
  die("mb_check_encoding passed on bad $encoding string: " . dbgPrint($badString));
}
|
|
|
|
// Convert `$fromString` from `$fromEncoding` to `$toEncoding` and abort the
// script, showing the input, the expected output and the actual output, unless
// the result is identical to `$toString`
function testConversion($fromString, $toString, $fromEncoding, $toEncoding) {
  $actual = mb_convert_encoding($fromString, $toEncoding, $fromEncoding);
  if ($actual === $toString)
    return;

  $message = "mb_convert_encoding not working on $fromEncoding input: " . dbgPrint($fromString)
           . "\nExpected $toEncoding: " . dbgPrint($toString)
           . "\nActually got: " . dbgPrint($actual);
  die($message);
}
|
|
|
|
// Run testConversion and additionally abort the script if the
// 'illegal_chars' value reported by mb_get_info changed during the conversion
function testValidConversion($fromString, $toString, $fromEncoding, $toEncoding) {
  $before = mb_get_info('illegal_chars');
  testConversion($fromString, $toString, $fromEncoding, $toEncoding);
  $after = mb_get_info('illegal_chars');

  if ($after === $before)
    return;
  die("mb_convert_encoding incremented illegal_chars on valid $fromEncoding string: " . dbgPrint($fromString) . " when converting to $toEncoding");
}
|
|
|
|
// Check that `$fromString` converts cleanly to `$toString`; unless `$bothWays`
// is false, also check the reverse conversion of `$toString` back to
// `$fromString`
function convertValidString($fromString, $toString, $fromEncoding, $toEncoding, $bothWays = true) {
  testValidConversion($fromString, $toString, $fromEncoding, $toEncoding);
  if (!$bothWays)
    return;
  testValidConversion($toString, $fromString, $toEncoding, $fromEncoding);
}
|
|
|
|
// Run testConversion on a string which is expected to be invalid and abort
// the script unless the 'illegal_chars' value reported by mb_get_info went up
// during the conversion
function convertInvalidString($fromString, $toString, $fromEncoding, $toEncoding) {
  $before = mb_get_info('illegal_chars');
  testConversion($fromString, $toString, $fromEncoding, $toEncoding);
  $after = mb_get_info('illegal_chars');

  if ($after <= $before) {
    die("mb_convert_encoding did not increment illegal_chars on invalid $fromEncoding string: " . dbgPrint($fromString) . " when converting to $toEncoding");
  }
}
|
|
|
|
// Full check for a valid string: first it must be recognized as valid in
// `$fromEncoding`, then it must convert to `$toString` (and back again, unless
// `$bothWays` is false)
function testValidString($fromString, $toString, $fromEncoding, $toEncoding, $bothWays = true) {
  identifyValidString($fromString, $fromEncoding);
  convertValidString($fromString, $toString, $fromEncoding, $toEncoding, $bothWays);
}
|
|
|
|
// Full check for an invalid string: first it must be rejected as invalid in
// `$fromEncoding`, then converting it must yield `$toString` and be counted as
// containing illegal characters
function testInvalidString($fromString, $toString, $fromEncoding, $toEncoding) {
  identifyInvalidString($fromString, $fromEncoding);
  convertInvalidString($fromString, $toString, $fromEncoding, $toEncoding);
}
|
|
|
|
// Only for encodings where valid characters can be concatenated together in any
// way, without any escape sequences
//
// Takes every key of `$charMap` (character => expected conversion) in random
// order, joins them into strings of up to 5-10 characters, and runs
// testValidString on each such string
function testAllValidChars($charMap, $fromEncoding, $toEncoding, $bothWays = true) {
  $pending = array_keys($charMap);
  shuffle($pending);

  while (count($pending) > 0) {
    // Never ask for more characters than are still waiting to be tested
    $chunkSize = min(rand(5,10), count($pending));
    $input     = '';
    $expected  = '';

    for ($taken = 0; $taken < $chunkSize; $taken++) {
      $char      = array_pop($pending);
      $input    .= $char;
      $expected .= $charMap[$char];
    }

    testValidString($input, $expected, $fromEncoding, $toEncoding, $bothWays);
  }
}
|
|
|
|
// Test every key of `$badChars`, each one followed by a single valid character
// picked from `$charMap`; the expected output is `$replacement` followed by the
// conversion of that valid character
function testAllInvalidChars($badChars, $charMap, $fromEncoding, $toEncoding, $replacement) {
  $remainingBad = array_keys($badChars);
  $goodPool     = array();

  while (count($remainingBad) > 0) {
    // Refill (and reshuffle) the pool of valid characters whenever it runs dry
    if (count($goodPool) === 0) {
      $goodPool = array_keys($charMap);
      shuffle($goodPool);
    }

    $good = array_pop($goodPool);
    $bad  = array_pop($remainingBad);

    testInvalidString($bad . $good, $replacement . $charMap[$good], $fromEncoding, $toEncoding);
  }
}
|
|
|
|
// Run testInvalidString on every key of `$truncated`, each on its own, with
// `$replacement` as the expected conversion result
function testTruncatedChars($truncated, $fromEncoding, $toEncoding, $replacement) {
  foreach ($truncated as $truncatedChar => $unused)
    testInvalidString($truncatedChar, $replacement, $fromEncoding, $toEncoding);
}
|
|
|
|
// For variable-width encodings, where we have an exhaustive list of
// all valid characters of any width
//
// Fills `$invalid` with byte sequences which are neither a valid character nor
// the beginning of one, and `$truncated` with byte sequences which are only the
// beginning of a longer character. Both are keyed by byte sequence and every
// value is `true`.
//
// `$valid` is keyed by valid character (byte string); only its keys are used
// `$startBytes` maps from first-byte values to the corresponding character length
// (For encodings where the first byte can tell you the length of a multi-byte
// character)
// Note that `$startBytes` can be partial!
function findInvalidChars($valid, &$invalid, &$truncated, $startBytes = array()) {
  $invalid   = array();
  $truncated = array();
  $prefixes  = array(); /* All sequences which are not (but can start) a valid character */

  foreach ($valid as $char => $unicode) {
    for ($len = 1; $len < strlen($char); $len++)
      $prefixes[substr($char, 0, $len)] = true;
  }

  // `$invalid` and `$truncated` must be captured by reference, or everything
  // recorded inside the closures would be lost when they return. Each closure
  // also captures its own variable by reference so that it can call itself:
  // the variable is only assigned after the closure has been created.

  // Try every byte after `$prefix`, descending into longer known prefixes
  $varLength = function($prefix) use($valid, $prefixes, &$invalid, &$truncated, &$varLength) {
    for ($byte = 0; $byte < 256; $byte++) {
      $str = $prefix . chr($byte);
      if (!isset($valid[$str])) {
        if (isset($prefixes[$str])) {
          $truncated[$str] = true;
          $varLength($str);
        } else {
          $invalid[$str] = true;
        }
      }
    }
  };

  // Enumerate all sequences which start with `$prefix` and have `$remaining`
  // more bytes; anything shorter than the full length is truncated
  $fixedLength = function($prefix, $remaining) use($valid, &$invalid, &$truncated, &$fixedLength) {
    if ($remaining == 0) {
      if (!isset($valid[$prefix]))
        $invalid[$prefix] = true;
    } else if ($remaining == 1) {
      $truncated[$prefix] = true;
      for ($i = 0; $i < 256; $i++) {
        $str = $prefix . chr($i);
        if (!isset($valid[$str]))
          $invalid[$str] = true;
      }
    } else {
      $truncated[$prefix] = true;
      for ($i = 0; $i < 256; $i++)
        $fixedLength($prefix . chr($i), $remaining - 1);
    }
  };

  for ($byte = 0; $byte < 256; $byte++) {
    if (isset($startBytes[$byte])) {
      // The length is known from the first byte alone
      $fixedLength(chr($byte), $startBytes[$byte] - 1);
    } else {
      $str = chr($byte);
      if (!isset($valid[$str])) {
        if (isset($prefixes[$str])) {
          $truncated[$str] = true;
          $varLength($str);
        } else {
          $invalid[$str] = true;
        }
      }
    }
  }
}
|
|
|
|
// Helper for building `$startBytes` map for above function
//
// Returns `$array` with every element of `$keys` set as a key whose value is
// `$value`; a key which is already present is overwritten
function map($keys, $value, $array = array()) {
  foreach ($keys as $k) {
    $array[$k] = $value;
  }
  return $array;
}
|
|
|
|
?>
|