1
0
mirror of https://github.com/php/php-src.git synced 2026-03-29 19:52:20 +02:00
Files
archived-php-src/ext/mbstring/tests/mb_convert_encoding.phpt
Alex Dowad 3e7acf901d Remove mbstring identify filters
mbstring had an 'identify filter' for almost every supported text encoding
which was used when auto-detecting the most likely encoding for a string.
It would run over the string and set a 'flag' if it saw anything which
did not appear likely to be the encoding in question.

One problem with this scheme was that encodings which merely appeared
less likely to be the correct one were completely rejected, even if there
was no better candidate. Another problem was that the 'identify filters'
had a huge amount of code duplication with the 'conversion filters'.

Eliminate the identify filters. Instead, when auto-detecting text
encoding, use conversion filters to see whether the input string is valid
in candidate encodings or not. At the same time, watch the type of
codepoints which the string decodes to and mark it as less likely if
non-printable characters (ESC, form feed, bell, etc.) or 'private use
area' codepoints are seen.

Interestingly, one old test case in which JIS text was misidentified
as UTF-8 (and this wrong behavior was enshrined in the test) was 'fixed'
and the JIS string is now auto-detected as JIS.
2020-11-09 13:45:17 +02:00

128 lines
3.7 KiB
PHP
Raw Blame History

This file contains ambiguous Unicode characters
This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.
--TEST--
mb_convert_encoding()
--SKIPIF--
<?php
// Skip this test when the mbstring extension is not loaded.
if (!extension_loaded('mbstring')) {
    die('skip mbstring not available');
}
?>
--INI--
output_handler=
mbstring.language=Japanese
--FILE--
<?php
// Exercises mb_convert_encoding() between SJIS, JIS and EUC-JP using the same
// Japanese sample text, with the source encoding given as a single name, a
// comma-separated string, an array, and 'auto'. Every line printed here is
// compared against the --EXPECT-- section, so output must not change.
// TODO: Add more tests
// SJIS string (BASE64 encoded)
$sjis = base64_decode('k/qWe4zqg2WDTINYg2eCxYK3gUIwMTIzNIJUglWCVoJXgliBQg==');
// JIS string (BASE64 encoded)
$jis = base64_decode('GyRCRnxLXDhsJUYlLSU5JUgkRyQ5ISMbKEIwMTIzNBskQiM1IzYjNyM4IzkhIxsoQg==');
// EUC-JP string
// NOTE(review): this literal is passed below with 'EUC-JP' as the source
// encoding, so the file presumably has to be stored as EUC-JP bytes -- confirm.
// NOTE(review): the expected SJIS/JIS output encodes full-width digits 5-9
// after "01234"; they are not visible in the literal as shown here -- verify
// the literal against the upstream file before editing it.
$euc_jp = '日本語テキストです。01234。';
// Test with a single "from encoding"
// Note: For some reason it complains, results are different. Not researched.
echo "== BASIC TEST ==\n";
// SJIS -> EUC-JP, printed as hex so the raw bytes can be compared
$s = $sjis;
$s = bin2hex(mb_convert_encoding($s, 'EUC-JP', 'SJIS'));
print("EUC-JP: $s\n"); // EUC-JP
// JIS -> EUC-JP; expected to yield the same hex as the SJIS case above
$s = $jis;
$s = bin2hex(mb_convert_encoding($s, 'EUC-JP', 'JIS'));
print("EUC-JP: $s\n"); // EUC-JP
// EUC-JP -> SJIS, printed as base64; expected to equal the $sjis fixture
$s = $euc_jp;
$s = mb_convert_encoding($s, 'SJIS', 'EUC-JP');
print("SJIS: ".base64_encode($s)."\n"); // SJIS
// EUC-JP -> JIS, printed as base64; expected to equal the $jis fixture
$s = $euc_jp;
$s = mb_convert_encoding($s, 'JIS', 'EUC-JP');
print("JIS: ".base64_encode($s)."\n"); // JIS
// Using an encoding list given as a comma-separated string
echo "== STRING ENCODING LIST ==\n";
$a = 'JIS,UTF-8,EUC-JP,SJIS';
$s = $jis;
$s = bin2hex(mb_convert_encoding($s, 'EUC-JP', $a));
print("EUC-JP: $s\n"); // EUC-JP
$s = $euc_jp;
$s = mb_convert_encoding($s, 'SJIS', $a);
print("SJIS: ".base64_encode($s)."\n"); // SJIS
$s = $euc_jp;
$s = mb_convert_encoding($s, 'JIS', $a);
print("JIS: ".base64_encode($s)."\n"); // JIS
// Using an encoding list given as an array (same candidates as the string list)
echo "== ARRAY ENCODING LIST ==\n";
$a = array(0=>'JIS', 1=>'UTF-8', 2=>'EUC-JP', 3=>'SJIS');
$s = $jis;
$s = bin2hex(mb_convert_encoding($s, 'EUC-JP', $a));
print("EUC-JP: $s\n"); // EUC-JP
$s = $euc_jp;
$s = mb_convert_encoding($s, 'SJIS', $a);
print("SJIS: ".base64_encode($s)."\n"); // SJIS
$s = $euc_jp;
$s = mb_convert_encoding($s, 'JIS', $a);
print("JIS: ".base64_encode($s)."\n"); // JIS
// Using Detect Order: 'auto' is passed as the source encoding
// (the --INI-- section sets mbstring.language=Japanese for this test)
echo "== DETECT ORDER ==\n";
$s = $jis;
$s = bin2hex(mb_convert_encoding($s, 'EUC-JP', 'auto'));
print("EUC-JP: $s\n"); // EUC-JP
$s = $euc_jp;
$s = mb_convert_encoding($s, 'SJIS', 'auto');
print("SJIS: ".base64_encode($s)."\n"); // SJIS
$s = $euc_jp;
$s = mb_convert_encoding($s, 'JIS', 'auto');
print("JIS: ".base64_encode($s)."\n"); // JIS
// Invalid Parameters
echo "== INVALID PARAMETER ==\n";
// An integer argument; the expected output is "INT: 1234"
$s = mb_convert_encoding(1234, 'EUC-JP');
print("INT: $s\n");
// An empty input string; the expected output has nothing after the label
$s = mb_convert_encoding('', 'EUC-JP');
print("EUC-JP: $s\n"); // EUC-JP (empty result)
$s = $euc_jp;
// An unknown target encoding is expected to raise a ValueError, whose message
// is printed and matched against the last line of --EXPECT--
try {
var_dump(mb_convert_encoding($s, 'BAD'));
} catch (\ValueError $e) {
echo $e->getMessage() . \PHP_EOL;
}
?>
--EXPECT--
== BASIC TEST ==
EUC-JP: c6fccbdcb8eca5c6a5ada5b9a5c8a4c7a4b9a1a33031323334a3b5a3b6a3b7a3b8a3b9a1a3
EUC-JP: c6fccbdcb8eca5c6a5ada5b9a5c8a4c7a4b9a1a33031323334a3b5a3b6a3b7a3b8a3b9a1a3
SJIS: k/qWe4zqg2WDTINYg2eCxYK3gUIwMTIzNIJUglWCVoJXgliBQg==
JIS: GyRCRnxLXDhsJUYlLSU5JUgkRyQ5ISMbKEIwMTIzNBskQiM1IzYjNyM4IzkhIxsoQg==
== STRING ENCODING LIST ==
EUC-JP: c6fccbdcb8eca5c6a5ada5b9a5c8a4c7a4b9a1a33031323334a3b5a3b6a3b7a3b8a3b9a1a3
SJIS: k/qWe4zqg2WDTINYg2eCxYK3gUIwMTIzNIJUglWCVoJXgliBQg==
JIS: GyRCRnxLXDhsJUYlLSU5JUgkRyQ5ISMbKEIwMTIzNBskQiM1IzYjNyM4IzkhIxsoQg==
== ARRAY ENCODING LIST ==
EUC-JP: c6fccbdcb8eca5c6a5ada5b9a5c8a4c7a4b9a1a33031323334a3b5a3b6a3b7a3b8a3b9a1a3
SJIS: k/qWe4zqg2WDTINYg2eCxYK3gUIwMTIzNIJUglWCVoJXgliBQg==
JIS: GyRCRnxLXDhsJUYlLSU5JUgkRyQ5ISMbKEIwMTIzNBskQiM1IzYjNyM4IzkhIxsoQg==
== DETECT ORDER ==
EUC-JP: c6fccbdcb8eca5c6a5ada5b9a5c8a4c7a4b9a1a33031323334a3b5a3b6a3b7a3b8a3b9a1a3
SJIS: k/qWe4zqg2WDTINYg2eCxYK3gUIwMTIzNIJUglWCVoJXgliBQg==
JIS: GyRCRnxLXDhsJUYlLSU5JUgkRyQ5ISMbKEIwMTIzNBskQiM1IzYjNyM4IzkhIxsoQg==
== INVALID PARAMETER ==
INT: 1234
EUC-JP:
mb_convert_encoding(): Argument #2 ($to_encoding) must be a valid encoding, "BAD" given