1
0
mirror of https://github.com/php/php-src.git synced 2026-03-24 00:02:20 +01:00

Improve mb_detect_encoding accuracy for text containing vowels with macrons

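The Māori language, among other world languages, commonly uses vowels
with macrons (ā, ē, ī, ō, ū). This commit stops treating the lowercase
forms of these vowels as rare codepoints.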
This commit is contained in:
Alex Dowad
2023-08-22 22:23:15 +02:00
parent d7eb4cfdb2
commit 81faab9235
3 changed files with 26 additions and 1 deletions

View File

@@ -3,18 +3,23 @@
0x0020 0x007E # ASCII 0x0020 0x007E # ASCII
0x00A1 0x00AC # Pound sign, Yen sign, copyright sign... 0x00A1 0x00AC # Pound sign, Yen sign, copyright sign...
0x00AE 0x00FF # Accented Latin characters 0x00AE 0x00FF # Accented Latin characters
0x0101 0x0101 # Lowercase a with macron
0x0104 0x0107 # Polish 0x0104 0x0107 # Polish
0x010C 0x010F # Czech 0x010C 0x010F # Czech
0x0113 0x0113 # Lowercase e with macron
0x0118 0x011B # Polish, Czech 0x0118 0x011B # Polish, Czech
0x011F 0x011F # Turkish 0x011F 0x011F # Turkish
0x012B 0x012B # Lowercase i with macron
0x0130 0x0131 # Turkish 0x0130 0x0131 # Turkish
0x0141 0x0144 # Polish 0x0141 0x0144 # Polish
0x0147 0x0148 # Czech 0x0147 0x0148 # Czech
0x014D 0x014D # Lowercase o with macron
0x0150 0x0151 # Hungarian 0x0150 0x0151 # Hungarian
0x0158 0x015B # Czech, Polish 0x0158 0x015B # Czech, Polish
0x015F 0x015F # Turkish 0x015F 0x015F # Turkish
0x0160 0x0161 # Used in Slavic names 0x0160 0x0161 # Used in Slavic names
0x0164 0x0165 # Czech 0x0164 0x0165 # Czech
0x016B 0x016B # Lowercase u with macron
0x016E 0x016F # Czech 0x016E 0x016F # Czech
0x0170 0x0171 # Hungarian 0x0170 0x0171 # Hungarian
0x0179 0x017E # Polish, Czech, other Slavic languages 0x0179 0x017E # Polish, Czech, other Slavic languages

View File

@@ -11,7 +11,7 @@
static const uint32_t rare_codepoint_bitvec[] = { static const uint32_t rare_codepoint_bitvec[] = {
0xffffd9ff, 0x00000000, 0x00000000, 0x80000000, 0xffffffff, 0x00002001, 0x00000000, 0x00000000, 0xffffd9ff, 0x00000000, 0x00000000, 0x80000000, 0xffffffff, 0x00002001, 0x00000000, 0x00000000,
0x70ff0f0f, 0xfffcffff, 0x70fcfe61, 0x81fc3fcc, 0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff, 0x70f70f0d, 0xfffcf7ff, 0x70fcde61, 0x81fc37cc, 0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff,
0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff,
0xfffff800, 0xffffffff, 0xffffffff, 0x0300ffff, 0x0000280f, 0x00000004, 0x00000000, 0x00000000, 0xfffff800, 0xffffffff, 0xffffffff, 0x0300ffff, 0x0000280f, 0x00000004, 0x00000000, 0x00000000,
0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000,

View File

@@ -85,6 +85,18 @@ $css = 'input[type="radio"]:checked + img {
}'; }';
echo mb_detect_encoding($css, mb_list_encodings(), true), "\n"; echo mb_detect_encoding($css, mb_list_encodings(), true), "\n";
// Test cases courtesy of Kirill Roskolii and Chris Burgess
echo "-- Māori text --\n";
echo mb_detect_encoding("Total Māori,31.5,33.3,31.8,33,36.4,33.2,33.2", ['UTF-8', 'ISO-8859-1', 'Windows-1251']), "\n";
// Names of native birds from Aotearoa:
echo mb_detect_encoding("Kākā", ['UTF-8', 'ISO-8859-1', 'Windows-1251']), "\n";
echo mb_detect_encoding("Whēkau", ['UTF-8', 'ISO-8859-1', 'Windows-1251']), "\n";
echo mb_detect_encoding("Tīwaiwaka", ['UTF-8', 'ISO-8859-1', 'Windows-1251']), "\n";
echo mb_detect_encoding("Kōtuku", ['UTF-8', 'ISO-8859-1', 'Windows-1251']), "\n";
echo mb_detect_encoding("Kererū", ['UTF-8', 'ISO-8859-1', 'Windows-1251']), "\n";
echo mb_detect_encoding("Tūī", ['UTF-8', 'ISO-8859-1', 'Windows-1251']), "\n";
echo "== DETECT ORDER ==\n"; echo "== DETECT ORDER ==\n";
mb_detect_order('auto'); mb_detect_order('auto');
@@ -408,6 +420,14 @@ UTF-8
UTF-8 UTF-8
SJIS SJIS
UTF-8 UTF-8
-- Māori text --
UTF-8
UTF-8
UTF-8
UTF-8
UTF-8
UTF-8
UTF-8
== DETECT ORDER == == DETECT ORDER ==
JIS: JIS JIS: JIS
EUC-JP: EUC-JP EUC-JP: EUC-JP