1
0
mirror of https://github.com/php/php-src.git synced 2026-03-24 08:12:21 +01:00

Add mbstring support for GB18030-2022 text encoding

The previous version of the GB18030 standard was published in 2005.
This commit adds support for the updated (2022) version of this text
encoding. The existing GB18030 implementation has been left unchanged
for backwards compatibility; users who want to use the new standard
must explicitly indicate the desired text encoding is 'GB18030-2022'.

The document which defines GB18030-2022, published by the government
of the People's Republic of China, defines three levels of standards
compliance. This implementation is intended to achieve Implementation
Level 3, which is the highest level of compliance.

Experts in the GB18030 standard are requested to assess this
implementation and report any deviation from the standard.
This commit is contained in:
Alex Dowad
2023-12-22 21:53:41 +02:00
parent febe05198d
commit 5fdb27246c
7 changed files with 64585 additions and 116 deletions

View File

@@ -11088,7 +11088,7 @@ static void mb_wchar_to_gb18030(uint32_t *in, size_t len, mb_convert_buf *buf, b
continue;
} else if (w >= ucs_a1_cp936_table_min && w < ucs_a1_cp936_table_max) {
if (w == 0x1F9) {
s = 0xA8Bf;
s = 0xA8BF;
} else {
s = ucs_a1_cp936_table[w - ucs_a1_cp936_table_min];
}
@@ -11560,6 +11560,319 @@ static void mb_wchar_to_cp936(uint32_t *in, size_t len, mb_convert_buf *buf, boo
MB_CONVERT_BUF_STORE(buf, out, limit);
}
/* Code points for selected two-byte codes in 0xFE50..0xFEA0, indexed by
 * (two-byte code - 0xFE50). Every slot which is not listed is zero; the decoder
 * treats a zero entry as "no entry here" and carries on to its generic lookup. */
static const unsigned short gb18030_2022_pua_tbl3[] = {
	[0xFE51 - 0xFE50] = 0xE816,
	[0xFE52 - 0xFE50] = 0xE817,
	[0xFE53 - 0xFE50] = 0xE818,
	[0xFE59 - 0xFE50] = 0x9FB4,
	[0xFE61 - 0xFE50] = 0x9FB5,
	[0xFE66 - 0xFE50] = 0x9FB6,
	[0xFE67 - 0xFE50] = 0x9FB7,
	[0xFE6C - 0xFE50] = 0xE831,
	[0xFE6D - 0xFE50] = 0x9FB8,
	[0xFE76 - 0xFE50] = 0xE83B,
	[0xFE7E - 0xFE50] = 0x9FB9,
	[0xFE90 - 0xFE50] = 0x9FBA,
	[0xFE91 - 0xFE50] = 0xE855,
	/* Highest index; fixes the array length at 0x51 entries */
	[0xFEA0 - 0xFE50] = 0x9FBB
};
/* Decode GB18030-2022 bytes into code points.
 *
 * in, in_len: pointer to / number of bytes still to be decoded; both are updated
 *             on return so that they describe the bytes which were not consumed.
 * buf, bufsize: output array and its capacity in code points.
 * state: not referenced by this function.
 *
 * Returns the number of code points stored in buf. Each pass of the loop stores
 * exactly one value; byte sequences which are invalid or cut short store
 * MBFL_BAD_INPUT instead of a code point. */
static size_t mb_gb18030_2022_to_wchar(unsigned char **in, size_t *in_len, uint32_t *buf, size_t bufsize, unsigned int *state)
{
unsigned char *p = *in, *e = p + *in_len;
uint32_t *out = buf, *limit = buf + bufsize;
/* Stop when the input is used up or the output buffer is full */
while (p < e && out < limit) {
unsigned char c = *p++;
if (c < 0x80) {
/* Bytes below 0x80 are emitted unchanged */
*out++ = c;
} else if (c == 0x80 || c == 0xFF) {
/* 0x80 and 0xFF are rejected as lead bytes */
*out++ = MBFL_BAD_INPUT;
} else {
if (p == e) {
/* Lead byte with no byte following it */
*out++ = MBFL_BAD_INPUT;
break;
}
unsigned char c2 = *p++;
/* Four-byte form: lead byte 0x81-0x84 or 0x90-0xE3, second byte 0x30-0x39 */
if (((c >= 0x81 && c <= 0x84) || (c >= 0x90 && c <= 0xE3)) && c2 >= 0x30 && c2 <= 0x39) {
if (p >= e) {
*out++ = MBFL_BAD_INPUT;
break;
}
unsigned char c3 = *p++;
/* The `p < e` test also rejects a sequence which ends after its third byte */
if (c3 >= 0x81 && c3 <= 0xFE && p < e) {
unsigned char c4 = *p++;
if (c4 >= 0x30 && c4 <= 0x39) {
if (c >= 0x90 && c <= 0xE3) {
/* Linear index of the four bytes, offset by 0x10000; values above 0x10FFFF are rejected */
unsigned int w = ((((c - 0x90)*10 + (c2 - 0x30))*126 + (c3 - 0x81)))*10 + (c4 - 0x30) + 0x10000;
*out++ = (w > 0x10FFFF) ? MBFL_BAD_INPUT : w;
} else {
/* Unicode BMP */
unsigned int w = (((c - 0x81)*10 + (c2 - 0x30))*126 + (c3 - 0x81))*10 + (c4 - 0x30);
/* A handful of indexes are mapped to fixed code points here, before the range table is consulted */
if (w == 0x98A4) {
*out++ = 0xE78D;
} else if (w == 0x98A6) {
*out++ = 0xE78E;
} else if (w == 0x98A5) {
*out++ = 0xE78F;
} else if (w >= 0x98A7 && w <= 0x98AD) {
*out++ = w + (0xE790 - 0x98A7);
} else if (w == 0x1D21) {
*out++ = 0xE7C7;
} else if (w == 0x4A71) {
*out++ = 0xE81E;
} else if (w == 0x4A72) {
*out++ = 0xE826;
} else if (w >= 0x4A73 && w <= 0x4A74) {
*out++ = w + (0xE82B - 0x4A73);
} else if (w == 0x4A75) {
*out++ = 0xE832;
} else if (w == 0x4A76) {
*out++ = 0xE843;
} else if (w == 0x4A77) {
*out++ = 0xE854;
} else if (w == 0x4A78) {
*out++ = 0xE864;
} else if (w <= 0x99FB) {
/* NOTE(review): the result of mbfl_bisec_srch is used as an index without a check;
 * presumably mbfl_gb2uni_tbl covers every remaining index up to 0x99FB - confirm */
*out++ = w + mbfl_gb_uni_ofst[mbfl_bisec_srch(w, mbfl_gb2uni_tbl, mbfl_gb_uni_max)];
} else {
*out++ = MBFL_BAD_INPUT;
}
}
} else {
*out++ = MBFL_BAD_INPUT;
}
} else {
*out++ = MBFL_BAD_INPUT;
}
} else if (((c >= 0xAA && c <= 0xAF) || (c >= 0xF8 && c <= 0xFE)) && (c2 >= 0xA1 && c2 <= 0xFE)) {
/* UDA part 1, 2: U+E000-U+E4C5 */
*out++ = 94*(c >= 0xF8 ? c - 0xF2 : c - 0xAA) + (c2 - 0xA1) + 0xE000;
} else if (c >= 0xA1 && c <= 0xA7 && c2 >= 0x40 && c2 < 0xA1 && c2 != 0x7F) {
/* UDA part 3: U+E4C6-U+E765 */
*out++ = 96*(c - 0xA1) + c2 - (c2 >= 0x80 ? 0x41 : 0x40) + 0xE4C6;
} else if (c2 >= 0x40 && c2 != 0x7F && c2 != 0xFF) {
/* Two-byte form: linear index with 192 slots per lead byte, counted from 0x8140 */
unsigned int w = (c - 0x81)*192 + c2 - 0x40;
/* The GB18030-2022 specific tables are tried first; a hit stores the value and
 * `continue`s, anything else falls through to the generic lookup below */
if (w >= 0x192B) {
if (w <= 0x1EBE) {
/* These indexes are deliberately left to the generic table */
if (w != 0x1963 && w != 0x1DBF && (w < 0x1E49 || w > 0x1E55) && w != 0x1E7F) {
*out++ = gb18030_2022_pua_tbl1[w - 0x192B];
continue;
}
} else if (w >= 0x413A) {
if (w <= 0x413E) {
*out++ = cp936_pua_tbl2[w - 0x413A];
continue;
} else if (w >= 0x5DD0 && w <= 0x5E20) {
/* This `c` shadows the lead byte, but only inside this block; the
 * range check after the block reads the lead byte again */
unsigned int c = gb18030_2022_pua_tbl3[w - 0x5DD0];
/* A zero table entry means "no entry"; fall through in that case */
if (c) {
*out++ = c;
continue;
}
}
}
}
/* Generic lookup, restricted to the lead/trail byte combinations accepted here */
if ((c >= 0x81 && c <= 0xA9) || (c >= 0xB0 && c <= 0xF7 && c2 >= 0xA1) || (c >= 0xAA && c <= 0xFE && c2 <= 0xA0)) {
ZEND_ASSERT(w < cp936_ucs_table_size);
*out++ = cp936_ucs_table[w];
} else {
*out++ = MBFL_BAD_INPUT;
}
} else {
*out++ = MBFL_BAD_INPUT;
}
}
}
/* Report the unconsumed remainder back to the caller */
*in_len = e - p;
*in = p;
return out - buf;
}
/* Encode `len` code points from `in` as GB18030-2022 and append the bytes to `buf`.
 *
 * For each code point a byte code `s` is looked up: first in the range-specific
 * branches below (which yield one- or two-byte codes), then in the table of
 * mappings which differ from CP936, and finally by computing a four-byte code.
 * A code point for which `s` is still zero afterwards is handed to
 * MB_CONVERT_ERROR. `end` is not referenced by this function. */
static void mb_wchar_to_gb18030_2022(uint32_t *in, size_t len, mb_convert_buf *buf, bool end)
{
	unsigned char *out, *limit;
	MB_CONVERT_BUF_LOAD(buf, out, limit);
	MB_CONVERT_BUF_ENSURE(buf, out, limit, len);

	while (len--) {
		uint32_t w = *in++;
		unsigned int s = 0;

		if (w == 0) {
			/* s == 0 means "no mapping" below, so NUL is written directly */
			out = mb_convert_buf_add(out, 0);
			continue;
		} else if (w >= ucs_a1_cp936_table_min && w < ucs_a1_cp936_table_max) {
			if (w == 0x1F9) {
				s = 0xA8BF;
			} else {
				s = ucs_a1_cp936_table[w - ucs_a1_cp936_table_min];
			}
		} else if (w >= ucs_a2_cp936_table_min && w < ucs_a2_cp936_table_max) {
			if (w == 0x20AC) { /* Euro sign */
				s = 0xA2E3;
			} else {
				s = ucs_a2_cp936_table[w - ucs_a2_cp936_table_min];
			}
		} else if (w >= ucs_a3_cp936_table_min && w < ucs_a3_cp936_table_max) {
			s = ucs_a3_cp936_table[w - ucs_a3_cp936_table_min];
		} else if (w >= 0x9FB4 && w <= 0x9FBB) {
			/* Newly mapped in GB18030-2022 */
			if (w == 0x9FB4) {
				s = 0xFE59;
			} else if (w == 0x9FB5) {
				s = 0xFE61;
			} else if (w == 0x9FB6) {
				s = 0xFE66;
			} else if (w == 0x9FB7) {
				s = 0xFE67;
			} else if (w == 0x9FB8) {
				s = 0xFE6D;
			} else if (w == 0x9FB9) {
				s = 0xFE7E;
			} else if (w == 0x9FBA) {
				s = 0xFE90;
			} else {
				s = 0xFEA0;
			}
		} else if (w >= ucs_i_cp936_table_min && w < ucs_i_cp936_table_max) {
			s = ucs_i_cp936_table[w - ucs_i_cp936_table_min];
		} else if (w >= ucs_ci_cp936_table_min && w < ucs_ci_cp936_table_max) {
			/* U+F900-U+FA2F CJK Compatibility Ideographs */
			if (w == 0xF92C) {
				s = 0xFD9C;
			} else if (w == 0xF979) {
				s = 0xFD9D;
			} else if (w == 0xF995) {
				s = 0xFD9E;
			} else if (w == 0xF9E7) {
				s = 0xFD9F;
			} else if (w == 0xF9F1) {
				s = 0xFDA0;
			} else if (w >= 0xFA0C && w <= 0xFA29) {
				s = ucs_ci_s_cp936_table[w - 0xFA0C];
			}
		} else if (w >= ucs_cf_cp936_table_min && w < ucs_cf_cp936_table_max) {
			/* CJK Compatibility Forms */
			s = ucs_cf_cp936_table[w - ucs_cf_cp936_table_min];
		} else if (w >= ucs_sfv_cp936_table_min && w < ucs_sfv_cp936_table_max) {
			/* U+FE50-U+FE6F Small Form Variants */
			s = ucs_sfv_cp936_table[w - ucs_sfv_cp936_table_min];
		} else if (w >= ucs_hff_cp936_table_min && w < ucs_hff_cp936_table_max) {
			/* U+FF00-U+FFFF HW/FW Forms */
			if (w == 0xFF04) {
				s = 0xA1E7;
			} else if (w == 0xFF5E) {
				s = 0xA1AB;
			} else if (w >= 0xFF01 && w <= 0xFF5D) {
				s = w - 0xFF01 + 0xA3A1;
			} else if (w >= 0xFFE0 && w <= 0xFFE5) {
				s = ucs_hff_s_cp936_table[w - 0xFFE0];
			}
		} else if (w >= 0xE000 && w <= 0xE864) {
			/* PUA */
			if (w < 0xE766) {
				if (w < 0xE4C6) {
					/* 94 trail bytes (0xA1 and up) per lead byte; lead bytes start
					 * at 0xAA for the first six rows and continue from 0xF8 after that */
					unsigned int c1 = w - 0xE000;
					s = (c1 % 94) + 0xA1;
					c1 /= 94;
					s |= (c1 + (c1 < 0x06 ? 0xAA : 0xF2)) << 8;
				} else {
					/* 96 trail bytes per lead byte, starting at 0x40 and skipping 0x7F */
					unsigned int c1 = w - 0xE4C6;
					s = ((c1 / 96) + 0xA1) << 8;
					c1 %= 96;
					s |= c1 + (c1 >= 0x3F ? 0x41 : 0x40);
				}
			} else {
				/* U+E766-U+E864 */
				/* Binary search over {first, last, code of first} rows; if no row
				 * contains w, s stays zero and a later step handles it */
				unsigned int k1 = 0, k2 = mbfl_gb18030_2022_pua_tbl_max;
				while (k1 < k2) {
					unsigned int k = (k1 + k2) >> 1;
					if (w < mbfl_gb18030_2022_pua_tbl[k][0]) {
						k2 = k;
					} else if (w > mbfl_gb18030_2022_pua_tbl[k][1]) {
						k1 = k + 1;
					} else {
						s = w - mbfl_gb18030_2022_pua_tbl[k][0] + mbfl_gb18030_2022_pua_tbl[k][2];
						break;
					}
				}
			}
		} else if (w >= 0xFE10 && w <= 0xFE19) {
			/* Newly mapped codepoints in GB18030-2022 */
			if (w == 0xFE11) {
				s = 0xA6DB;
			} else if (w == 0xFE12) {
				s = 0xA6DA;
			} else if (w <= 0xFE16) {
				s = w - (0xFE10 - 0xA6D9);
			} else if (w <= 0xFE18) {
				s = w - (0xFE17 - 0xA6EC);
			} else {
				s = 0xA6F3;
			}
		} else if (w == 0x1E3F) {
			/* Newly mapped codepoint in GB18030-2022 */
			s = 0xA8BC;
		}

		/* While GB18030 and CP936 are very similar, some mappings are different between these encodings;
		 * do a binary search in a table of differing codepoints to see if we have one */
		if (!s && w >= mbfl_gb18030_c_tbl_key[0] && w <= mbfl_gb18030_c_tbl_key[mbfl_gb18030_c_tbl_max-1]) {
			int i = mbfl_bisec_srch2(w, mbfl_gb18030_c_tbl_key, mbfl_gb18030_c_tbl_max);
			if (i >= 0) {
				s = mbfl_gb18030_c_tbl_val[i];
			}
		}

		/* If we have not yet found a suitable mapping for this codepoint, it requires a 4-byte code */
		if (!s && w >= 0x80 && w <= 0xFFFF) {
			/* BMP */
			int i = mbfl_bisec_srch(w, mbfl_uni2gb2022_tbl, mbfl_gb2022_uni_max);
			if (i >= 0) {
				/* Split the linear index into digit (10), byte (126), digit (10) and lead byte */
				unsigned int c1 = w - mbfl_gb2022_uni_ofst[i];
				s = (c1 % 10) + 0x30;
				c1 /= 10;
				s |= ((c1 % 126) + 0x81) << 8;
				c1 /= 126;
				s |= ((c1 % 10) + 0x30) << 16;
				c1 /= 10;
				s |= (c1 + 0x81) << 24;
			}
		} else if (w >= 0x10000 && w <= 0x10FFFF) {
			/* Code set 3: Unicode U+10000-U+10FFFF */
			unsigned int c1 = w - 0x10000;
			s = (c1 % 10) + 0x30;
			c1 /= 10;
			s |= ((c1 % 126) + 0x81) << 8;
			c1 /= 126;
			s |= ((c1 % 10) + 0x30) << 16;
			c1 /= 10;
			s |= (c1 + 0x90) << 24;
		}

		if (!s) {
			/* The error handler must be given this encoding's own converter, not
			 * mb_wchar_to_gb18030 (the GB18030-2005 encoder); presumably it uses the
			 * converter to emit whatever replaces the unmappable code point */
			MB_CONVERT_ERROR(buf, out, limit, w, mb_wchar_to_gb18030_2022);
			MB_CONVERT_BUF_ENSURE(buf, out, limit, len);
		} else if (s < 0x80) {
			out = mb_convert_buf_add(out, s);
		} else if (s > 0xFFFFFF) {
			/* Four-byte code: a non-zero top byte distinguishes it from a two-byte code */
			MB_CONVERT_BUF_ENSURE(buf, out, limit, len + 4);
			out = mb_convert_buf_add4(out, (s >> 24) & 0xFF, (s >> 16) & 0xFF, (s >> 8) & 0xFF, s & 0xFF);
		} else {
			MB_CONVERT_BUF_ENSURE(buf, out, limit, len + 2);
			out = mb_convert_buf_add2(out, (s >> 8) & 0xFF, s & 0xFF);
		}
	}

	MB_CONVERT_BUF_STORE(buf, out, limit);
}
/* Step through a GB18030 string one character at a time. Find the last position at or
* before `limit` which falls directly after the end of a (single or multi-byte) character */
static zend_always_inline unsigned char* step_through_gb18030_str(unsigned char *p, unsigned char *limit)
@@ -11673,6 +11986,21 @@ const mbfl_encoding mbfl_encoding_cp936 = {
NULL,
};
/* Encoding descriptor for GB18030-2022. It carries the identifier
 * mbfl_no_encoding_gb18030_2022, the string "GB18030-2022" (twice), and the
 * decoder/encoder pair mb_gb18030_2022_to_wchar / mb_wchar_to_gb18030_2022.
 * The last slot reuses mb_cut_gb18030 rather than a 2022-specific function.
 * NOTE(review): the initializer is positional, so the meaning of each slot
 * (including the NULL ones) depends on the field order of mbfl_encoding,
 * which is declared elsewhere - confirm against that declaration. */
const mbfl_encoding mbfl_encoding_gb18030_2022 = {
mbfl_no_encoding_gb18030_2022,
"GB18030-2022",
"GB18030-2022",
NULL,
NULL,
MBFL_ENCTYPE_GL_UNSAFE,
NULL,
NULL,
mb_gb18030_2022_to_wchar,
mb_wchar_to_gb18030_2022,
NULL,
mb_cut_gb18030,
};
/*
* BIG5/CP950
*/

View File

@@ -32,6 +32,7 @@ extern const mbfl_encoding mbfl_encoding_euc_kr;
extern const mbfl_encoding mbfl_encoding_uhc;
extern const mbfl_encoding mbfl_encoding_gb18030;
extern const mbfl_encoding mbfl_encoding_gb18030_2022;
extern const mbfl_encoding mbfl_encoding_cp936;
extern const mbfl_encoding mbfl_encoding_big5;
extern const mbfl_encoding mbfl_encoding_cp950;

View File

@@ -92,6 +92,34 @@ static const unsigned short mbfl_gb18030_pua_tbl[][3] = {
static const int mbfl_gb18030_pua_tbl_max = sizeof(mbfl_gb18030_pua_tbl)/(sizeof(unsigned short)*3);
/* Ranges of code points in U+E766..U+E855 which have a two-byte code.
 * Each row is { first code point, last code point, two-byte code of the first };
 * the code for any other member of a range is found by adding its distance
 * from the first. Rows are in ascending order and do not overlap, which the
 * binary search over this table relies on. */
static const unsigned short mbfl_gb18030_2022_pua_tbl[][3] = {
	/* first   last    code  */
	{ 0xE766, 0xE76B, 0xA2AB },
	{ 0xE76D, 0xE76D, 0xA2E4 },
	{ 0xE76E, 0xE76F, 0xA2EF },
	{ 0xE770, 0xE771, 0xA2FD },
	{ 0xE772, 0xE77C, 0xA4F4 },
	{ 0xE77D, 0xE784, 0xA5F7 },
	{ 0xE785, 0xE78C, 0xA6B9 },
	{ 0xE797, 0xE79F, 0xA6F6 },
	{ 0xE7A0, 0xE7AE, 0xA7C2 },
	{ 0xE7AF, 0xE7BB, 0xA7F2 },
	{ 0xE7BC, 0xE7C6, 0xA896 },
	{ 0xE7C9, 0xE7CC, 0xA8C1 },
	{ 0xE7CD, 0xE7E1, 0xA8EA },
	{ 0xE7E2, 0xE7E2, 0xA958 },
	{ 0xE7E3, 0xE7E3, 0xA95B },
	{ 0xE7E4, 0xE7E6, 0xA95D },
	{ 0xE7F4, 0xE800, 0xA997 },
	{ 0xE801, 0xE80F, 0xA9F0 },
	{ 0xE810, 0xE814, 0xD7FA },
	{ 0xE816, 0xE818, 0xFE51 },
	{ 0xE831, 0xE831, 0xFE6C },
	{ 0xE83B, 0xE83B, 0xFE76 },
	{ 0xE855, 0xE855, 0xFE91 }
};

/* Number of rows in mbfl_gb18030_2022_pua_tbl */
static const int mbfl_gb18030_2022_pua_tbl_max =
	sizeof(mbfl_gb18030_2022_pua_tbl) / sizeof(mbfl_gb18030_2022_pua_tbl[0]);
static const unsigned short mbfl_gb2uni_tbl[] = {
0x0000, 0x0023, 0x0024, 0x0025, 0x0026, 0x002c, 0x002d, 0x0031,
0x0032, 0x0050, 0x0051, 0x0058, 0x0059, 0x005e, 0x005f, 0x005f,
@@ -233,4 +261,285 @@ static const unsigned short mbfl_gb_uni_ofst[] = {
static const int mbfl_gb_uni_max = sizeof(mbfl_gb_uni_ofst)/sizeof(unsigned short);
/* Code point ranges searched by the GB18030-2022 encoder (via mbfl_bisec_srch)
 * for BMP code points which need a four-byte code. The index returned by the
 * search selects the matching entry of mbfl_gb2022_uni_ofst.
 * NOTE(review): the values appear to be consecutive { first, last } pairs in
 * ascending order (there are exactly twice as many values here as there are
 * offsets) - confirm against mbfl_bisec_srch. */
static const unsigned short mbfl_uni2gb2022_tbl[] = {
0x0080, 0x00a3, 0x00a5, 0x00a6, 0x00a9, 0x00af, 0x00b2, 0x00b6,
0x00b8, 0x00d6, 0x00d8, 0x00df, 0x00e2, 0x00e7, 0x00eb, 0x00eb,
0x00ee, 0x00f1, 0x00f4, 0x00f6, 0x00f8, 0x00f8, 0x00fb, 0x00fb,
0x00fd, 0x0100, 0x0102, 0x0112, 0x0114, 0x011a, 0x011c, 0x012a,
0x012c, 0x0143, 0x0145, 0x0147, 0x0149, 0x014c, 0x014e, 0x016a,
0x016c, 0x01cd, 0x01cf, 0x01cf, 0x01d1, 0x01d1, 0x01d3, 0x01d3,
0x01d5, 0x01d5, 0x01d7, 0x01d7, 0x01d9, 0x01d9, 0x01db, 0x01db,
0x01dd, 0x01f8, 0x01fa, 0x0250, 0x0252, 0x0260, 0x0262, 0x02c6,
0x02c8, 0x02c8, 0x02cc, 0x02d8, 0x02da, 0x0390, 0x03a2, 0x03a2,
0x03aa, 0x03b0, 0x03c2, 0x03c2, 0x03ca, 0x0400, 0x0402, 0x040f,
0x0450, 0x0450, 0x0452, 0x200f, 0x2011, 0x2012, 0x2017, 0x2017,
0x201a, 0x201b, 0x201e, 0x2024, 0x2027, 0x202f, 0x2031, 0x2031,
0x2034, 0x2034, 0x2036, 0x203a, 0x203c, 0x20ab, 0x20ad, 0x2102,
0x2104, 0x2104, 0x2106, 0x2108, 0x210a, 0x2115, 0x2117, 0x2120,
0x2122, 0x215f, 0x216c, 0x216f, 0x217a, 0x218f, 0x2194, 0x2195,
0x219a, 0x2207, 0x2209, 0x220e, 0x2210, 0x2210, 0x2212, 0x2214,
0x2216, 0x2219, 0x221b, 0x221c, 0x2221, 0x2222, 0x2224, 0x2224,
0x2226, 0x2226, 0x222c, 0x222d, 0x222f, 0x2233, 0x2238, 0x223c,
0x223e, 0x2247, 0x2249, 0x224b, 0x224d, 0x2251, 0x2253, 0x225f,
0x2262, 0x2263, 0x2268, 0x226d, 0x2270, 0x2294, 0x2296, 0x2298,
0x229a, 0x22a4, 0x22a6, 0x22be, 0x22c0, 0x2311, 0x2313, 0x245f,
0x246a, 0x2473, 0x249c, 0x24ff, 0x254c, 0x254f, 0x2574, 0x2580,
0x2590, 0x2592, 0x2596, 0x259f, 0x25a2, 0x25b1, 0x25b4, 0x25bb,
0x25be, 0x25c5, 0x25c8, 0x25ca, 0x25cc, 0x25cd, 0x25d0, 0x25e1,
0x25e6, 0x2604, 0x2607, 0x2608, 0x260a, 0x263f, 0x2641, 0x2641,
0x2643, 0x2e80, 0x2e82, 0x2e83, 0x2e85, 0x2e87, 0x2e89, 0x2e8a,
0x2e8d, 0x2e96, 0x2e98, 0x2ea6, 0x2ea8, 0x2ea9, 0x2eab, 0x2ead,
0x2eaf, 0x2eb2, 0x2eb4, 0x2eb5, 0x2eb8, 0x2eba, 0x2ebc, 0x2ec9,
0x2ecb, 0x2fef, 0x2ffc, 0x2fff, 0x3004, 0x3004, 0x3018, 0x301c,
0x301f, 0x3020, 0x302a, 0x303d, 0x303f, 0x3040, 0x3094, 0x309a,
0x309f, 0x30a0, 0x30f7, 0x30fb, 0x30ff, 0x3104, 0x312a, 0x321f,
0x322a, 0x3230, 0x3232, 0x32a2, 0x32a4, 0x338d, 0x3390, 0x339b,
0x339f, 0x33a0, 0x33a2, 0x33c3, 0x33c5, 0x33cd, 0x33cf, 0x33d0,
0x33d3, 0x33d4, 0x33d6, 0x3446, 0x3448, 0x3472, 0x3474, 0x359d,
0x359f, 0x360d, 0x360f, 0x3619, 0x361b, 0x3917, 0x3919, 0x396d,
0x396f, 0x39ce, 0x39d1, 0x39de, 0x39e0, 0x3a72, 0x3a74, 0x3b4d,
0x3b4f, 0x3c6d, 0x3c6f, 0x3cdf, 0x3ce1, 0x4055, 0x4057, 0x415e,
0x4160, 0x4336, 0x4338, 0x43ab, 0x43ad, 0x43b0, 0x43b2, 0x43dc,
0x43de, 0x44d5, 0x44d7, 0x464b, 0x464d, 0x4660, 0x4662, 0x4722,
0x4724, 0x4728, 0x472a, 0x477b, 0x477d, 0x478c, 0x478e, 0x4946,
0x4948, 0x4979, 0x497b, 0x497c, 0x497e, 0x4981, 0x4984, 0x4984,
0x4987, 0x499a, 0x499c, 0x499e, 0x49a0, 0x49b5, 0x49b8, 0x4c76,
0x4c78, 0x4c9e, 0x4ca4, 0x4d12, 0x4d1a, 0x4dad, 0x4daf, 0x4dff,
0x9fa6, 0xd7ff, 0xe76c, 0xe76c, 0xe78d, 0xe78d, 0xe78e, 0xe78e,
0xe78f, 0xe78f, 0xe790, 0xe796, 0xe7c7, 0xe7c7, 0xe7c8, 0xe7c8,
0xe7e7, 0xe7f3, 0xe815, 0xe815, 0xe819, 0xe81d, 0xe81e, 0xe81e,
0xe81f, 0xe825, 0xe826, 0xe826, 0xe827, 0xe82a, 0xe82b, 0xe82c,
0xe82d, 0xe830, 0xe832, 0xe832, 0xe833, 0xe83a, 0xe83c, 0xe842,
0xe843, 0xe843, 0xe844, 0xe853, 0xe854, 0xe854, 0xe856, 0xe863,
0xe864, 0xe864, 0xe865, 0xf92b, 0xf92d, 0xf978, 0xf97a, 0xf994,
0xf996, 0xf9e6, 0xf9e8, 0xf9f0, 0xf9f2, 0xfa0b, 0xfa10, 0xfa10,
0xfa12, 0xfa12, 0xfa15, 0xfa17, 0xfa19, 0xfa1e, 0xfa22, 0xfa22,
0xfa25, 0xfa26, 0xfa2a, 0xfe2f, 0xfe32, 0xfe32, 0xfe45, 0xfe48,
0xfe53, 0xfe53, 0xfe58, 0xfe58, 0xfe67, 0xfe67, 0xfe6c, 0xff00,
0xff5f, 0xffdf, 0xffe6, 0xffff,
};
/* One offset per range of mbfl_uni2gb2022_tbl. The GB18030-2022 encoder
 * subtracts the selected offset from a code point to obtain the linear index
 * which it then splits into the four bytes of the code. */
static const unsigned short mbfl_gb2022_uni_ofst[] = {
128, 129, 131, 133, 134, 135, 137, 140,
142, 144, 145, 147, 148, 149, 150, 151,
152, 153, 154, 155, 156, 157, 158, 159,
160, 161, 162, 163, 164, 165, 166, 167,
168, 171, 172, 189, 196, 213, 220, 221,
285, 286, 287, 291, 293, 295, 297, 298,
300, 301, 302, 303, 304, 305, 306, 307,
308, 320, 330, 334, 338, 339, 340, 341,
342, 343, 347, 348, 349, 354, 355, 359,
360, 361, 362, 363, 365, 369, 371, 372,
373, 374, 375, 376, 386, 426, 502, 538,
553, 556, 558, 560, 562, 564, 565, 567,
571, 573, 574, 575, 576, 577, 578, 579,
581, 582, 583, 584, 585, 586, 588, 589,
590, 602, 606, 625, 627, 636, 637, 720,
724, 810, 813, 850, 860, 861, 862, 864,
867, 868, 869, 870, 872, 873, 874, 875,
876, 877, 878, 879, 880, 882, 883, 884,
885, 886, 887, 888, 889, 890, 891, 892,
893, 894, 895, 896, 897, 898, 899, 900,
901, 902, 903, 905, 907, 908, 909, 911,
912, 917, 924, 925, 21827, 25775, 20201, 20200,
20202, 20201, 51878, 25866, 25896, 25929, 25932, 40365,
25933, 40372, 25934, 40376, 25936, 40381, 25938, 25939,
40397, 25940, 40413, 25942, 40428, 25943, 25944, 25945,
25946, 25947, 25948, 25952, 25953, 25955, 25956, 25959,
25961, 25964, 25966, 25984, 25994, 25998, 26012, 26016,
26110, 26116,
};
/* Number of offsets; passed to mbfl_bisec_srch together with mbfl_uni2gb2022_tbl */
static const int mbfl_gb2022_uni_max = sizeof(mbfl_gb2022_uni_ofst)/sizeof(unsigned short);
/* Code points for the two-byte codes from 0xA2AB onwards. The GB18030-2022
 * decoder indexes this table with (linear two-byte index - 0x192B), where the
 * linear index counts 192 slots per lead byte; the marker comments below give
 * the two-byte code at which each group of rows starts.
 * NOTE(review): unlike the other tables in this header, this one is not
 * declared `static`, so every file including the header defines an external
 * symbol - confirm that this is intended. */
const unsigned short gb18030_2022_pua_tbl1[] = {
/* 0xA2AB */
0xE766,0xE767,0xE768,0xE769,0xE76A,
0xE76B,0x2488,0x2489,0x248a,0x248b,0x248c,0x248d,0x248e,
0x248f,0x2490,0x2491,0x2492,0x2493,0x2494,0x2495,0x2496,
0x2497,0x2498,0x2499,0x249a,0x249b,0x2474,0x2475,0x2476,
0x2477,0x2478,0x2479,0x247a,0x247b,0x247c,0x247d,0x247e,
0x247f,0x2480,0x2481,0x2482,0x2483,0x2484,0x2485,0x2486,
0x2487,0x2460,0x2461,0x2462,0x2463,0x2464,0x2465,0x2466,
0x2467,0x2468,0x2469,0xE76C,0xE76D,0x3220,0x3221,0x3222,
0x3223,0x3224,0x3225,0x3226,0x3227,0x3228,0x3229,0xE76E,
0xE76F,0x2160,0x2161,0x2162,0x2163,0x2164,0x2165,0x2166,
0x2167,0x2168,0x2169,0x216a,0x216b,0xE770,0xE771,0x0000,
/* 0xA340 */
0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,
0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,
0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,
0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,
0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,
0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,
0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,
0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,
0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,
0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,
0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,
0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,
0x0000,0xff01,0xff02,0xff03,0xffe5,0xff05,0xff06,0xff07,
0xff08,0xff09,0xff0a,0xff0b,0xff0c,0xff0d,0xff0e,0xff0f,
0xff10,0xff11,0xff12,0xff13,0xff14,0xff15,0xff16,0xff17,
0xff18,0xff19,0xff1a,0xff1b,0xff1c,0xff1d,0xff1e,0xff1f,
0xff20,0xff21,0xff22,0xff23,0xff24,0xff25,0xff26,0xff27,
0xff28,0xff29,0xff2a,0xff2b,0xff2c,0xff2d,0xff2e,0xff2f,
0xff30,0xff31,0xff32,0xff33,0xff34,0xff35,0xff36,0xff37,
0xff38,0xff39,0xff3a,0xff3b,0xff3c,0xff3d,0xff3e,0xff3f,
0xff40,0xff41,0xff42,0xff43,0xff44,0xff45,0xff46,0xff47,
0xff48,0xff49,0xff4a,0xff4b,0xff4c,0xff4d,0xff4e,0xff4f,
0xff50,0xff51,0xff52,0xff53,0xff54,0xff55,0xff56,0xff57,
0xff58,0xff59,0xff5a,0xff5b,0xff5c,0xff5d,0xffe3,0x0000,
/* 0xA440 */
0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,
0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,
0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,
0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,
0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,
0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,
0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,
0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,
0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,
0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,
0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,
0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,
0x0000,0x3041,0x3042,0x3043,0x3044,0x3045,0x3046,0x3047,
0x3048,0x3049,0x304a,0x304b,0x304c,0x304d,0x304e,0x304f,
0x3050,0x3051,0x3052,0x3053,0x3054,0x3055,0x3056,0x3057,
0x3058,0x3059,0x305a,0x305b,0x305c,0x305d,0x305e,0x305f,
0x3060,0x3061,0x3062,0x3063,0x3064,0x3065,0x3066,0x3067,
0x3068,0x3069,0x306a,0x306b,0x306c,0x306d,0x306e,0x306f,
0x3070,0x3071,0x3072,0x3073,0x3074,0x3075,0x3076,0x3077,
0x3078,0x3079,0x307a,0x307b,0x307c,0x307d,0x307e,0x307f,
0x3080,0x3081,0x3082,0x3083,0x3084,0x3085,0x3086,0x3087,
0x3088,0x3089,0x308a,0x308b,0x308c,0x308d,0x308e,0x308f,
0x3090,0x3091,0x3092,0x3093,0xE772,0xE773,0xE774,0xE775,
0xE776,0xE777,0xE778,0xE779,0xE77A,0xE77B,0xE77C,0x0000,
/* 0xA540 */
0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,
0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,
0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,
0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,
0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,
0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,
0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,
0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,
0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,
0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,
0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,
0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,
0x0000,0x30a1,0x30a2,0x30a3,0x30a4,0x30a5,0x30a6,0x30a7,
0x30a8,0x30a9,0x30aa,0x30ab,0x30ac,0x30ad,0x30ae,0x30af,
0x30b0,0x30b1,0x30b2,0x30b3,0x30b4,0x30b5,0x30b6,0x30b7,
0x30b8,0x30b9,0x30ba,0x30bb,0x30bc,0x30bd,0x30be,0x30bf,
0x30c0,0x30c1,0x30c2,0x30c3,0x30c4,0x30c5,0x30c6,0x30c7,
0x30c8,0x30c9,0x30ca,0x30cb,0x30cc,0x30cd,0x30ce,0x30cf,
0x30d0,0x30d1,0x30d2,0x30d3,0x30d4,0x30d5,0x30d6,0x30d7,
0x30d8,0x30d9,0x30da,0x30db,0x30dc,0x30dd,0x30de,0x30df,
0x30e0,0x30e1,0x30e2,0x30e3,0x30e4,0x30e5,0x30e6,0x30e7,
0x30e8,0x30e9,0x30ea,0x30eb,0x30ec,0x30ed,0x30ee,0x30ef,
0x30f0,0x30f1,0x30f2,0x30f3,0x30f4,0x30f5,0x30f6,0xE77D,
0xE77E,0xE77F,0xE780,0xE781,0xE782,0xE783,0xE784,0x0000,
/* 0xA640 */
0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,
0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,
0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,
0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,
0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,
0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,
0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,
0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,
0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,
0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,
0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,
0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,
0x0000,0x0391,0x0392,0x0393,0x0394,0x0395,0x0396,0x0397,
0x0398,0x0399,0x039a,0x039b,0x039c,0x039d,0x039e,0x039f,
0x03a0,0x03a1,0x03a3,0x03a4,0x03a5,0x03a6,0x03a7,0x03a8,
0x03a9,0xE785,0xE786,0xE787,0xE788,0xE789,0xE78A,0xE78B,
0xE78C,0x03b1,0x03b2,0x03b3,0x03b4,0x03b5,0x03b6,0x03b7,
0x03b8,0x03b9,0x03ba,0x03bb,0x03bc,0x03bd,0x03be,0x03bf,
0x03c0,0x03c1,0x03c3,0x03c4,0x03c5,0x03c6,0x03c7,0x03c8,
0x03c9,0xFE10,0xFE12,0xFE11,0xFE13,0xFE14,0xFE15,0xFE16,
0xfe35,0xfe36,0xfe39,0xfe3a,0xfe3f,0xfe40,0xfe3d,0xfe3e,
0xfe41,0xfe42,0xfe43,0xfe44,0xFE17,0xFE18,0xfe3b,0xfe3c,
0xfe37,0xfe38,0xfe31,0xFE19,0xfe33,0xfe34,0xE797,0xE798,
0xE799,0xE79A,0xE79B,0xE79C,0xE79D,0xE79E,0xE79F,0x0000,
/* 0xA740 */
0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,
0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,
0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,
0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,
0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,
0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,
0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,
0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,
0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,
0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,
0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,
0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,
0x0000,0x0410,0x0411,0x0412,0x0413,0x0414,0x0415,0x0401,
0x0416,0x0417,0x0418,0x0419,0x041a,0x041b,0x041c,0x041d,
0x041e,0x041f,0x0420,0x0421,0x0422,0x0423,0x0424,0x0425,
0x0426,0x0427,0x0428,0x0429,0x042a,0x042b,0x042c,0x042d,
0x042e,0x042f,0xE7A0,0xE7A1,0xE7A2,0xE7A3,0xE7A4,0xE7A5,
0xE7A6,0xE7A7,0xE7A8,0xE7A9,0xE7AA,0xE7AB,0xE7AC,0xE7AD,
0xE7AE,0x0430,0x0431,0x0432,0x0433,0x0434,0x0435,0x0451,
0x0436,0x0437,0x0438,0x0439,0x043a,0x043b,0x043c,0x043d,
0x043e,0x043f,0x0440,0x0441,0x0442,0x0443,0x0444,0x0445,
0x0446,0x0447,0x0448,0x0449,0x044a,0x044b,0x044c,0x044d,
0x044e,0x044f,0xE7AF,0xE7B0,0xE7B1,0xE7B2,0xE7B3,0xE7B4,
0xE7B5,0xE7B6,0xE7B7,0xE7B8,0xE7B9,0xE7BA,0xE7BB,0x0000,
/* 0xA840 */
0x02ca,0x02cb,0x02d9,0x2013,0x2015,0x2025,0x2035,0x2105,
0x2109,0x2196,0x2197,0x2198,0x2199,0x2215,0x221f,0x2223,
0x2252,0x2266,0x2267,0x22bf,0x2550,0x2551,0x2552,0x2553,
0x2554,0x2555,0x2556,0x2557,0x2558,0x2559,0x255a,0x255b,
0x255c,0x255d,0x255e,0x255f,0x2560,0x2561,0x2562,0x2563,
0x2564,0x2565,0x2566,0x2567,0x2568,0x2569,0x256a,0x256b,
0x256c,0x256d,0x256e,0x256f,0x2570,0x2571,0x2572,0x2573,
0x2581,0x2582,0x2583,0x2584,0x2585,0x2586,0x2587,0x0000,
0x2588,0x2589,0x258a,0x258b,0x258c,0x258d,0x258e,0x258f,
0x2593,0x2594,0x2595,0x25bc,0x25bd,0x25e2,0x25e3,0x25e4,
0x25e5,0x2609,0x2295,0x3012,0x301d,0x301e,0xE7BC,0xE7BD,
0xE7BE,0xE7BF,0xE7C0,0xE7C1,0xE7C2,0xE7C3,0xE7C4,0xE7C5,
0xE7C6,0x0101,0x00e1,0x01ce,0x00e0,0x0113,0x00e9,0x011b,
0x00e8,0x012b,0x00ed,0x01d0,0x00ec,0x014d,0x00f3,0x01d2,
0x00f2,0x016b,0x00fa,0x01d4,0x00f9,0x01d6,0x01d8,0x01da,
0x01dc,0x00fc,0x00ea,0x0251,0x1E3F,0x0144,0x0148,0x01F9,
0x0261,0xE7C9,0xE7CA,0xE7CB,0xE7CC,0x3105,0x3106,0x3107,
0x3108,0x3109,0x310a,0x310b,0x310c,0x310d,0x310e,0x310f,
0x3110,0x3111,0x3112,0x3113,0x3114,0x3115,0x3116,0x3117,
0x3118,0x3119,0x311a,0x311b,0x311c,0x311d,0x311e,0x311f,
0x3120,0x3121,0x3122,0x3123,0x3124,0x3125,0x3126,0x3127,
0x3128,0x3129,0xE7CD,0xE7CE,0xE7CF,0xE7D0,0xE7D1,0xE7D2,
0xE7D3,0xE7D4,0xE7D5,0xE7D6,0xE7D7,0xE7D8,0xE7D9,0xE7DA,
0xE7DB,0xE7DC,0xE7DD,0xE7DE,0xE7DF,0xE7E0,0xE7E1,0x0000,
/* 0xA940 */
0x3021,0x3022,0x3023,0x3024,0x3025,0x3026,0x3027,0x3028,
0x3029,0x32a3,0x338e,0x338f,0x339c,0x339d,0x339e,0x33a1,
0x33c4,0x33ce,0x33d1,0x33d2,0x33d5,0xfe30,0xffe2,0xffe4,
0xE7E2,0x2121,0x3231,0xE7E3,0x2010,0xE7E4,0xE7E5,0xE7E6,
0x30fc,0x309b,0x309c,0x30fd,0x30fe,0x3006,0x309d,0x309e,
0xfe49,0xfe4a,0xfe4b,0xfe4c,0xfe4d,0xfe4e,0xfe4f,0xfe50,
0xfe51,0xfe52,0xfe54,0xfe55,0xfe56,0xfe57,0xfe59,0xfe5a,
0xfe5b,0xfe5c,0xfe5d,0xfe5e,0xfe5f,0xfe60,0xfe61,0x0000,
0xfe62,0xfe63,0xfe64,0xfe65,0xfe66,0xfe68,0xfe69,0xfe6a,
0xfe6b,0x303E,0x2FF0,0x2FF1,0x2FF2,0x2FF3,0x2FF4,0x2FF5,
0x2FF6,0x2FF7,0x2FF8,0x2FF9,0x2FFA,0x2FFB,0x3007,0xE7F4,
0xE7F5,0xE7F6,0xE7F7,0xE7F8,0xE7F9,0xE7FA,0xE7FB,0xE7FC,
0xE7FD,0xE7FE,0xE7FF,0xE800,0x2500,0x2501,0x2502,0x2503,
0x2504,0x2505,0x2506,0x2507,0x2508,0x2509,0x250a,0x250b,
0x250c,0x250d,0x250e,0x250f,0x2510,0x2511,0x2512,0x2513,
0x2514,0x2515,0x2516,0x2517,0x2518,0x2519,0x251a,0x251b,
0x251c,0x251d,0x251e,0x251f,0x2520,0x2521,0x2522,0x2523,
0x2524,0x2525,0x2526,0x2527,0x2528,0x2529,0x252a,0x252b,
0x252c,0x252d,0x252e,0x252f,0x2530,0x2531,0x2532,0x2533,
0x2534,0x2535,0x2536,0x2537,0x2538,0x2539,0x253a,0x253b,
0x253c,0x253d,0x253e,0x253f,0x2540,0x2541,0x2542,0x2543,
0x2544,0x2545,0x2546,0x2547,0x2548,0x2549,0x254a,0x254b,
0xE801,0xE802,0xE803,0xE804,0xE805,0xE806,0xE807,0xE808,
0xE809,0xE80A,0xE80B,0xE80C,0xE80D,0xE80E,0xE80F,0x0000,
};
#endif /* UNICODE_TABLE_GB18030_H */

View File

@@ -88,10 +88,10 @@ static const mbfl_encoding *mbfl_encoding_ptr_list[] = {
&mbfl_encoding_sjis,
&mbfl_encoding_eucjp_win,
&mbfl_encoding_eucjp2004,
&mbfl_encoding_sjis_docomo,
&mbfl_encoding_sjis_kddi,
&mbfl_encoding_sjis_sb,
&mbfl_encoding_sjis_mac,
&mbfl_encoding_sjis_docomo,
&mbfl_encoding_sjis_kddi,
&mbfl_encoding_sjis_sb,
&mbfl_encoding_sjis_mac,
&mbfl_encoding_sjis2004,
&mbfl_encoding_utf8_docomo,
&mbfl_encoding_utf8_kddi_a,
@@ -104,6 +104,7 @@ static const mbfl_encoding *mbfl_encoding_ptr_list[] = {
&mbfl_encoding_2022jp,
&mbfl_encoding_2022jpms,
&mbfl_encoding_gb18030,
&mbfl_encoding_gb18030_2022,
&mbfl_encoding_cp1252,
&mbfl_encoding_cp1254,
&mbfl_encoding_8859_1,
@@ -148,145 +149,176 @@ static const mbfl_encoding *mbfl_encoding_ptr_list[] = {
* Command used: gperf encodings.txt --readonly-tables --null-strings --ignore-case
* The encodings.txt contains all the contents of the name fields of the mbfl_encoding_ptr_list table. */
static const int8_t mbfl_encoding_ptr_list_after_hashing[187] = {
-1, -1, -1,
65,
static const int8_t mbfl_encoding_ptr_list_after_hashing[231] = {
-1, -1,
61,
66,
23,
73,
59,
-1,
1,
-1, -1, -1,
11,
-1,
5,
9,
-1,
60,
36,
-1, -1,
58,
42,
-1, -1,
18,
27,
77,
26,
40,
72,
12,
10,
2,
31,
-1, -1,
75,
74,
33,
45,
-1,
67,
13,
-1,
51,
53,
11,
1,
-1,
48,
56,
-1,
38,
20,
46,
-1,
52,
54,
-1,
14,
24,
44,
39,
43,
2,
40,
46,
27,
76,
26,
-1,
30,
49,
57,
76,
-1, -1,
68,
73,
7,
16,
-1,
35,
66,
-1, -1, -1,
75,
-1,
47,
55,
-1, -1, -1,
63,
15,
78,
36,
-1,
50,
58,
8,
17,
-1,
21,
70,
-1,
29,
5,
-1, -1,
69,
39,
7,
-1, -1,
64,
67,
-1, -1,
30,
48,
56,
-1, -1, -1,
35,
74,
-1, -1,
24,
53,
62,
43,
-1, -1,
45,
22,
-1, -1, -1,
6,
61,
-1, -1,
71,
52,
3,
-1, -1, -1,
18,
71,
-1, -1, -1,
21,
-1,
37,
-1,
4,
60,
25,
-1, -1,
72,
51,
-1,
44,
29,
-1,
28,
0,
-1,
14,
31,
63,
12,
-1,
13,
33,
-1, -1,
68,
-1, -1, -1, -1,
-1, -1,
20,
-1, -1, -1, -1,
-1, -1, -1,
77,
-1, -1, -1, -1,
-1, -1, -1, -1,
65,
-1, -1, -1, -1,
70,
-1, -1, -1, -1,
-1,
41,
-1, -1, -1, -1,
-1,
17,
-1, -1, -1,
42,
16,
-1, -1, -1, -1,
-1, -1, -1, -1,
-1, -1, -1, -1,
15,
-1, -1, -1, -1,
34,
-1, -1, -1, -1,
-1, -1, -1,
32,
50,
34,
-1, -1, -1,
62,
-1, -1, -1, -1, -1, -1, -1, -1, -1,
59,
0,
-1, -1, -1, -1,
22,
-1, -1, -1, -1, -1, -1, -1, -1, -1,
25,
41,
-1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
19,
-1, -1, -1,
4,
-1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
69,
-1, -1, -1, -1,
64,
-1, -1, -1, -1,
-1, -1, -1, -1,
-1, -1, -1, -1,
-1, -1, -1, -1,
-1, -1, -1, -1,
-1, -1, -1, -1,
-1, -1, -1, -1,
-1, -1, -1, -1,
-1, -1, -1, -1,
-1,
19
};
static unsigned int mbfl_name2encoding_perfect_hash(const char *str, size_t len)
{
static const unsigned char asso_values[] =
{
187, 187, 187, 187, 187, 187, 187, 187, 187, 187,
187, 187, 187, 187, 187, 187, 187, 187, 187, 187,
187, 187, 187, 187, 187, 187, 187, 187, 187, 187,
187, 187, 187, 187, 187, 187, 187, 187, 187, 187,
187, 187, 187, 187, 187, 0, 187, 187, 5, 20,
0, 15, 40, 10, 25, 70, 5, 60, 187, 187,
187, 187, 187, 187, 187, 75, 5, 0, 20, 5,
0, 75, 5, 0, 40, 75, 20, 0, 0, 0,
35, 45, 50, 0, 75, 0, 187, 0, 187, 187,
0, 187, 187, 187, 187, 187, 187, 75, 5, 0,
20, 5, 0, 75, 5, 0, 40, 75, 20, 0,
0, 0, 35, 45, 50, 0, 75, 0, 187, 0,
187, 187, 0, 187, 187, 187, 187, 187, 187, 187,
187, 187, 187, 187, 187, 187, 187, 187, 187, 187,
187, 187, 187, 187, 187, 187, 187, 187, 187, 187,
187, 187, 187, 187, 187, 187, 187, 187, 187, 187,
187, 187, 187, 187, 187, 187, 187, 187, 187, 187,
187, 187, 187, 187, 187, 187, 187, 187, 187, 187,
187, 187, 187, 187, 187, 187, 187, 187, 187, 187,
187, 187, 187, 187, 187, 187, 187, 187, 187, 187,
187, 187, 187, 187, 187, 187, 187, 187, 187, 187,
187, 187, 187, 187, 187, 187, 187, 187, 187, 187,
187, 187, 187, 187, 187, 187, 187, 187, 187, 187,
187, 187, 187, 187, 187, 187, 187, 187, 187, 187,
187, 187, 187, 187, 187, 187, 187, 187, 187, 187,
187, 187, 187, 187, 187, 187
231, 231, 231, 231, 231, 231, 231, 231, 231, 231,
231, 231, 231, 231, 231, 231, 231, 231, 231, 231,
231, 231, 231, 231, 231, 231, 231, 231, 231, 231,
231, 231, 231, 231, 231, 231, 231, 231, 231, 231,
231, 231, 231, 231, 231, 5, 231, 231, 0, 50,
5, 15, 35, 10, 20, 75, 0, 45, 231, 231,
231, 231, 231, 231, 231, 80, 5, 0, 0, 0,
75, 75, 0, 0, 15, 70, 0, 5, 0, 0,
25, 55, 30, 0, 10, 0, 231, 25, 231, 231,
0, 231, 231, 231, 231, 231, 231, 80, 5, 0,
0, 0, 75, 75, 0, 0, 15, 70, 0, 5,
0, 0, 25, 55, 30, 0, 10, 0, 231, 25,
231, 231, 0, 231, 231, 231, 231, 231, 231, 231,
231, 231, 231, 231, 231, 231, 231, 231, 231, 231,
231, 231, 231, 231, 231, 231, 231, 231, 231, 231,
231, 231, 231, 231, 231, 231, 231, 231, 231, 231,
231, 231, 231, 231, 231, 231, 231, 231, 231, 231,
231, 231, 231, 231, 231, 231, 231, 231, 231, 231,
231, 231, 231, 231, 231, 231, 231, 231, 231, 231,
231, 231, 231, 231, 231, 231, 231, 231, 231, 231,
231, 231, 231, 231, 231, 231, 231, 231, 231, 231,
231, 231, 231, 231, 231, 231, 231, 231, 231, 231,
231, 231, 231, 231, 231, 231, 231, 231, 231, 231,
231, 231, 231, 231, 231, 231, 231, 231, 231, 231,
231, 231, 231, 231, 231, 231, 231, 231, 231, 231,
231, 231, 231, 231, 231, 231
};
unsigned int hval = len;
@@ -345,7 +377,7 @@ const mbfl_encoding *mbfl_name2encoding_ex(const char *name, size_t name_len)
/* Use perfect hash lookup for name */
if (name_len <= NAME_HASH_MAX_NAME_LENGTH && name_len >= NAME_HASH_MIN_NAME_LENGTH) {
unsigned int key = mbfl_name2encoding_perfect_hash(name, name_len);
if (key <= 186) {
if (key <= sizeof(mbfl_encoding_ptr_list_after_hashing)) {
int8_t offset = mbfl_encoding_ptr_list_after_hashing[key];
if (offset >= 0) {
encoding = mbfl_encoding_ptr_list + offset;

View File

@@ -84,6 +84,7 @@ enum mbfl_no_encoding {
mbfl_no_encoding_2022jp_kddi,
mbfl_no_encoding_2022jpms,
mbfl_no_encoding_gb18030,
mbfl_no_encoding_gb18030_2022,
mbfl_no_encoding_cp1252,
mbfl_no_encoding_cp1254,
mbfl_no_encoding_8859_1,

File diff suppressed because it is too large Load Diff

View File

@@ -0,0 +1,310 @@
--TEST--
Exhaustive test of verification and conversion of GB18030-2022 text
--EXTENSIONS--
mbstring
--SKIPIF--
<?php
// This test is marked slow: bail out when the environment asks to skip slow tests.
if (getenv("SKIP_SLOW_TESTS")) {
    die("skip slow test");
}
?>
--FILE--
<?php
// Helpers such as testAllValidChars() and findInvalidChars() are not defined in this
// file; presumably they come from this include.
include('encoding_tests.inc');
srand(2323); // Make results consistent
// Unconvertible input is replaced by U+0025, so the expected outputs below contain '%'
mb_substitute_character(0x25); // '%'
// Two-byte GB18030-2022 codes (keys) and the UTF-16BE code unit each one is expected
// to convert to (values). The pairs are checked in both directions below.
// NOTE(review): going by the variable name, these are presumably the mappings which
// differ from the older GB18030 encoding; confirm against the standard.
$updatedMappings = [
"\xA6\xD9" => "\xFE\x10",
"\xA6\xDA" => "\xFE\x12",
"\xA6\xDB" => "\xFE\x11",
"\xA6\xDC" => "\xFE\x13",
"\xA6\xDD" => "\xFE\x14",
"\xA6\xDE" => "\xFE\x15",
"\xA6\xDF" => "\xFE\x16",
"\xA6\xEC" => "\xFE\x17",
"\xA6\xED" => "\xFE\x18",
"\xA6\xF3" => "\xFE\x19",
"\xA8\xBC" => "\x1E\x3F",
"\xA8\xBF" => "\x01\xF9",
"\xA9\x89" => "\x30\x3E",
"\xA9\x8A" => "\x2F\xF0",
"\xA9\x8B" => "\x2F\xF1",
"\xA9\x8C" => "\x2F\xF2",
"\xA9\x8D" => "\x2F\xF3",
"\xA9\x8E" => "\x2F\xF4",
"\xA9\x8F" => "\x2F\xF5",
"\xA9\x90" => "\x2F\xF6",
"\xA9\x91" => "\x2F\xF7",
"\xA9\x92" => "\x2F\xF8",
"\xA9\x93" => "\x2F\xF9",
"\xA9\x94" => "\x2F\xFA",
"\xA9\x95" => "\x2F\xFB",
"\xFE\x50" => "\x2E\x81",
"\xFE\x51" => "\xE8\x16",
"\xFE\x52" => "\xE8\x17",
"\xFE\x53" => "\xE8\x18",
"\xFE\x54" => "\x2E\x84",
"\xFE\x55" => "\x34\x73",
"\xFE\x56" => "\x34\x47",
"\xFE\x57" => "\x2E\x88",
"\xFE\x58" => "\x2E\x8B",
"\xFE\x59" => "\x9F\xB4",
"\xFE\x5A" => "\x35\x9E",
"\xFE\x5B" => "\x36\x1A",
"\xFE\x5C" => "\x36\x0E",
"\xFE\x5D" => "\x2E\x8C",
"\xFE\x5E" => "\x2E\x97",
"\xFE\x5F" => "\x39\x6E",
"\xFE\x60" => "\x39\x18",
"\xFE\x61" => "\x9F\xB5",
"\xFE\x62" => "\x39\xCF",
"\xFE\x63" => "\x39\xDF",
"\xFE\x64" => "\x3A\x73",
"\xFE\x65" => "\x39\xD0",
"\xFE\x66" => "\x9F\xB6",
"\xFE\x67" => "\x9F\xB7",
"\xFE\x68" => "\x3B\x4E",
"\xFE\x69" => "\x3C\x6E",
"\xFE\x6A" => "\x3C\xE0",
"\xFE\x6B" => "\x2E\xA7",
"\xFE\x6C" => "\xE8\x31",
"\xFE\x6D" => "\x9F\xB8",
"\xFE\x6E" => "\x2E\xAA",
"\xFE\x6F" => "\x40\x56",
"\xFE\x76" => "\xE8\x3B",
"\xFE\x7E" => "\x9F\xB9",
"\xFE\x90" => "\x9F\xBA",
"\xFE\x91" => "\xE8\x55",
"\xFE\xA0" => "\x9F\xBB"];
// Decode direction: GB18030-2022 bytes -> UTF-16BE
testAllValidChars($updatedMappings, 'GB18030-2022', 'UTF-16BE', false);
// Encode direction: the same pairs with keys and values swapped
testAllValidChars(array_flip($updatedMappings), 'UTF-16BE', 'GB18030-2022', false);
// Sample of codepoints outside the BMP: UTF-32BE codepoint (keys) and the four-byte
// GB18030-2022 code each one is expected to encode to (values).
// Every key appears exactly once; a repeated key in a PHP array literal is silently
// collapsed, so duplicated rows would add nothing to the test.
$sampleSMP = [
"\x00\x10\x03\x08" => "\xDE\x30\xE6\x36",
"\x00\x10\x14\xEB" => "\xDE\x34\xB8\x35",
"\x00\x10\x29\x76" => "\xDE\x38\xCE\x34",
"\x00\x10\x40\x6E" => "\xDF\x33\xA4\x34",
"\x00\x10\x78\x7B" => "\xE0\x34\xD5\x33",
"\x00\x01\x25\x2A" => "\x90\x37\xC6\x34",
"\x00\x01\x5B\xA4" => "\x91\x38\xCF\x30",
"\x00\x01\x6D\x81" => "\x92\x32\xA0\x33",
"\x00\x01\x7F\xB2" => "\x92\x35\xF8\x30",
"\x00\x01\x89\x9B" => "\x92\x37\xF9\x37",
"\x00\x01\x9E\x77" => "\x93\x32\x99\x37",
"\x00\x02\x08\x9A" => "\x95\x33\xE0\x38",
"\x00\x02\x1B\x00" => "\x95\x37\xBF\x38",
"\x00\x02\x31\xBE" => "\x96\x32\x90\x30",
"\x00\x02\x64\xD4" => "\x97\x32\xBF\x38",
"\x00\x02\xA9\xA0" => "\x98\x36\xBD\x30",
"\x00\x02\xBA\x38" => "\x98\x39\xEB\x38",
"\x00\x03\x1C\x13" => "\x9A\x39\xDC\x39",
"\x00\x03\x20\x6D" => "\x9B\x30\xCE\x33",
"\x00\x03\x22\xA9" => "\x9B\x31\x89\x35",
"\x00\x03\x39\xB3" => "\x9B\x35\xDF\x33",
"\x00\x03\xA7\xF2" => "\x9D\x38\x93\x36",
"\x00\x03\xDF\xFB" => "\x9E\x39\xC4\x31",
"\x00\x04\x01\x69" => "\x9F\x36\xA9\x39",
"\x00\x04\x23\x79" => "\xA0\x33\x9F\x39",
"\x00\x04\x26\x52" => "\xA0\x33\xE8\x38",
"\x00\x04\x38\xDB" => "\xA0\x37\xCB\x33",
"\x00\x04\x46\x84" => "\xA1\x30\xAF\x30",
"\x00\x04\x6C\x7C" => "\xA1\x38\x8B\x30",
"\x00\x04\x78\x41" => "\xA2\x30\xBC\x33",
"\x00\x04\x97\x32" => "\xA2\x36\xE0\x34",
"\x00\x04\x9E\xCC" => "\xA2\x38\xA7\x30",
"\x00\x04\xC5\xDB" => "\xA3\x36\x9E\x39",
"\x00\x04\xF4\xE2" => "\xA4\x35\xE4\x38",
"\x00\x05\x3B\xA6" => "\xA6\x30\x96\x34",
"\x00\x05\x76\x53" => "\xA7\x32\x8C\x35",
"\x00\x05\xEA\x9F" => "\xA9\x35\xDB\x37",
"\x00\x06\x12\x29" => "\xAA\x33\xDF\x39",
"\x00\x06\x1B\x9E" => "\xAA\x35\xD6\x30",
"\x00\x06\x3B\x26" => "\xAB\x32\x8B\x32",
"\x00\x06\x4C\xA8" => "\xAB\x35\xD1\x34",
"\x00\x06\x63\x3E" => "\xAC\x30\x9D\x36",
"\x00\x06\xB3\xA1" => "\xAD\x36\xC7\x35",
"\x00\x07\x0A\x31" => "\xAF\x34\x93\x35",
"\x00\x07\x22\xA7" => "\xAF\x39\x8F\x37",
"\x00\x07\x79\xA3" => "\xB1\x36\xE4\x35",
"\x00\x07\x88\xFA" => "\xB1\x39\xF3\x32",
"\x00\x07\xCE\xCA" => "\xB3\x34\x8C\x34",
"\x00\x07\xF8\xD2" => "\xB4\x32\xD0\x34",
"\x00\x08\x20\xF6" => "\xB5\x30\xE4\x30",
"\x00\x08\xAD\x05" => "\xB7\x39\x9F\x35",
"\x00\x08\xEA\x7E" => "\xB9\x31\xDD\x32",
"\x00\x08\xF0\xB8" => "\xB9\x32\xFE\x36",
"\x00\x09\x14\x07" => "\xBA\x30\x96\x35",
"\x00\x09\x41\xDD" => "\xBA\x39\xBD\x39",
"\x00\x09\x42\xEF" => "\xBA\x39\xD9\x33",
"\x00\x09\xBA\x2B" => "\xBD\x33\xF5\x37",
"\x00\x0A\x26\x00" => "\xBF\x35\xEA\x32",
"\x00\x0A\x36\xE9" => "\xBF\x39\xA3\x31",
"\x00\x0A\x7A\x20" => "\xC1\x32\xF5\x38",
"\x00\x0A\x9C\x93" => "\xC1\x39\xF5\x37",
"\x00\x0A\xC0\xD7" => "\xC2\x37\xA6\x31",
"\x00\x0A\xD8\x77" => "\xC3\x32\x8C\x39",
"\x00\x0B\x1A\x9B" => "\xC4\x35\xC4\x31",
"\x00\x0B\x4F\x27" => "\xC5\x36\x9B\x33",
"\x00\x0B\x72\x6D" => "\xC6\x33\xB0\x33",
"\x00\x0B\xEE\x23" => "\xC8\x38\xC1\x33",
"\x00\x0B\xF0\xDF" => "\xC8\x39\x89\x33",
"\x00\x0C\x0B\xE1" => "\xC9\x34\xC6\x37",
"\x00\x0C\x4C\x98" => "\xCA\x37\xD9\x34",
"\x00\x0C\x5F\x41" => "\xCB\x31\xBF\x31",
"\x00\x0C\x63\xE4" => "\xCB\x32\xB7\x38",
"\x00\x0C\x70\x0A" => "\xCB\x34\xF2\x38",
"\x00\x0C\xAD\x6A" => "\xCC\x37\xB0\x30",
"\x00\x0C\xCC\x03" => "\xCD\x33\xCB\x33",
"\x00\x0C\xD5\x4C" => "\xCD\x35\xBD\x30",
"\x00\x0C\xE6\x70" => "\xCD\x38\xF9\x38",
"\x00\x0D\x1B\x6A" => "\xCE\x39\xDC\x30",
"\x00\x0D\x55\xEE" => "\xD0\x31\xCE\x30",
"\x00\x0D\xBB\xB1" => "\xD2\x32\xA5\x31",
"\x00\x0D\xC0\x4F" => "\xD2\x33\x9D\x33",
"\x00\x0D\xFA\x84" => "\xD3\x35\x87\x34",
"\x00\x0E\x16\x71" => "\xD4\x30\xDC\x33",
"\x00\x0E\x1E\x03" => "\xD4\x32\xA2\x31",
"\x00\x0E\x20\xE8" => "\xD4\x32\xEC\x32",
"\x00\x0E\x39\x6A" => "\xD4\x37\xE9\x36",
"\x00\x0E\x6A\x95" => "\xD5\x37\xE8\x33",
"\x00\x0E\x7E\xCD" => "\xD6\x31\xF5\x39",
"\x00\x0E\x80\x69" => "\xD6\x32\xA1\x31",
"\x00\x0E\x9A\x7F" => "\xD6\x37\xC6\x39",
"\x00\x0E\xEE\x12" => "\xD8\x34\xC4\x34",
"\x00\x0E\xFC\xA1" => "\xD8\x37\xBF\x31",
"\x00\x0F\x29\xB0" => "\xD9\x36\xD2\x36",
"\x00\x0F\x2A\x12" => "\xD9\x36\xDC\x34",
"\x00\x0F\x6C\x8C" => "\xDB\x30\x9E\x32",
"\x00\x0F\xAF\x04" => "\xDC\x33\xDD\x38",
"\x00\x0F\xBE\x65" => "\xDC\x36\xED\x35",
"\x00\x0F\xE5\x88" => "\xDD\x34\xE7\x34",
"\x00\x0F\xE7\xB1" => "\xDD\x35\xA0\x37",
"\x00\x0F\xF4\x27" => "\xDD\x37\xE3\x37"];
// Encode direction only: UTF-32BE -> GB18030-2022
testAllValidChars($sampleSMP, 'UTF-32BE', 'GB18030-2022', false);
/**
 * Load a GB18030-2022 mapping table into two lookup arrays.
 *
 * Lines starting with '#' are skipped. Every other line is expected to hold two
 * hexadecimal fields separated by whitespace: the Unicode codepoint first, then
 * the GB18030 byte sequence written as a single hex number. Lines which do not
 * match that shape are ignored.
 *
 * @param string $path  Path of the mapping table file
 * @param array  $from  Output: GB18030 byte string => encoded codepoint
 * @param array  $to    Output: encoded codepoint => GB18030 byte string
 * @param bool   $utf32 Encode codepoints as 4 bytes big-endian (pack 'N') instead
 *                      of 2 bytes big-endian (pack 'n')
 * @throws RuntimeException if the file cannot be opened for reading
 */
function readGB18030_2022_ConversionTable($path, &$from, &$to, $utf32 = false) {
    $from = [];
    $to = [];

    // The table is only read; 'r+' would needlessly require write permission
    $fp = fopen($path, 'r');
    if ($fp === false)
        throw new RuntimeException("Unable to open conversion table: $path");

    // Compare with false explicitly so that a falsy line (e.g. a final "0") cannot end the loop early
    while (($line = fgets($fp, 256)) !== false) {
        if ($line[0] == '#')
            continue;
        if (sscanf($line, "%x\t%x", $codepoint, $char) == 2) {
            $codepoint = $utf32 ? pack('N', $codepoint) : pack('n', $codepoint);
            if ($char == PHP_INT_MAX) {
                // We may be on a 32-bit machine and testing a text encoding with 4-byte codes
                // (which can't be represented in a PHP integer).
                // Re-read the SECOND column as hex text; it starts after the first run of whitespace.
                $field = ltrim(substr($line, strcspn($line, " \t")));
                $hex = substr($field, 0, strspn($field, '0123456789abcdefABCDEF'));
                if (strlen($hex) % 2)
                    $hex = '0' . $hex; // hex2bin() needs an even number of digits
                $char = hex2bin($hex);
            } else {
                if ($char <= 0xFF)
                    $char = chr($char); // hex codes must not have leading zero bytes
                else if ($char <= 0xFFFF)
                    $char = pack('n', $char);
                else if ($char <= 0xFFFFFF)
                    $char = chr($char >> 16) . pack('n', $char & 0xFFFF);
                else
                    $char = pack('N', $char);
            }
            $from[$char] = $codepoint;
            $to[$codepoint] = $char;
        }
    }

    fclose($fp);
}
// Load the BMP mapping table. $toUnicode ends up keyed by GB18030 byte string and
// $fromUnicode by 2-byte big-endian codepoint (the $utf32 argument is left at its default).
readGB18030_2022_ConversionTable(__DIR__ . '/data/GB18030-2022MappingTableBMP.txt', $toUnicode, $fromUnicode);
// We will test 4-byte codes separately
// NOTE(review): findInvalidChars() is defined outside this file; presumably it fills
// $invalid and $truncated with byte sequences derived from the valid set - confirm.
findInvalidChars($toUnicode, $invalid, $truncated);
/**
 * Filter predicate: true when $gb cannot be (the start of) a four-byte code.
 *
 * A candidate four-byte code has a first byte in 0x81-0x84 or 0x90-0xE3 and, if a
 * second byte is present, that byte in 0x30-0x39. Anything else yields true.
 */
function notFourByteCode($gb) {
    $lead = ord($gb);
    $leadCanStartFourByteCode =
        ($lead >= 0x81 && $lead <= 0x84) || ($lead >= 0x90 && $lead <= 0xE3);
    if (!$leadCanStartFourByteCode) {
        return true;
    }

    if (strlen($gb) > 1) {
        $second = ord($gb[1]);
        return $second < 0x30 || $second > 0x39;
    }
    return false;
}
// Keep only the entries whose key cannot be (the start of) a four-byte code;
// ARRAY_FILTER_USE_KEY makes array_filter pass the array key to notFourByteCode()
$invalid = array_filter($invalid, 'notFourByteCode', ARRAY_FILTER_USE_KEY);
$truncated = array_filter($truncated, 'notFourByteCode', ARRAY_FILTER_USE_KEY);
testAllValidChars($toUnicode, 'GB18030-2022', 'UTF-16BE', false);
// "\x00%" is the substitute character '%' as a UTF-16BE code unit
testAllInvalidChars($invalid, $toUnicode, 'GB18030-2022', 'UTF-16BE', "\x00%");
testTruncatedChars($truncated, 'GB18030-2022', 'UTF-16BE', "\x00%");
echo "Tested GB18030-2022 (BMP) -> UTF-16BE\n";
// Test one random 4-byte code for each range used for Unicode codepoints in BMP
/**
 * Convert the four bytes of a four-byte code into a linear index.
 *
 * The bytes are treated as digits of a mixed-radix number, most significant first:
 * $byte4 counts from 0x81, $byte3 from 0x30 (radix 10), $byte2 from 0x81 (radix 126)
 * and $byte1 from 0x30 (radix 10). So 0x81 0x30 0x81 0x30 maps to index 0.
 */
function fourByteCodeIndex($byte4, $byte3, $byte2, $byte1) {
    // Horner evaluation of the mixed-radix number
    $index = $byte4 - 0x81;
    $index = $index * 10 + ($byte3 - 0x30);
    $index = $index * 126 + ($byte2 - 0x81);
    $index = $index * 10 + ($byte1 - 0x30);
    return $index;
}
/**
 * Inverse of the linear-index calculation: turn an index back into the four-byte
 * string whose bytes count from 0x81, 0x30, 0x81 and 0x30 respectively.
 */
function fourByteCodeFromIndex($index) {
    // [weight of the digit, value of the byte which represents digit 0], most significant first
    $positions = [
        [10 * 126 * 10, 0x81],
        [10 * 126, 0x30],
        [10, 0x81],
        [1, 0x30],
    ];

    $code = '';
    foreach ($positions as [$weight, $base]) {
        $digit = intdiv($index, $weight);
        $index -= $digit * $weight;
        $code .= chr($digit + $base);
    }
    return $code;
}
// Invalid 4-byte codes in range for BMP
// (each expected output is the substitute character '%' encoded as UTF-32BE)
testInvalidString("\x81\x30\x81\xFF", "\x00\x00\x00%", "GB18030-2022", "UTF-32BE");
testInvalidString("\x84\x31\xA4\x40", "\x00\x00\x00%", "GB18030-2022", "UTF-32BE");
testInvalidString("\x84\x31\xA5\x30", "\x00\x00\x00%", "GB18030-2022", "UTF-32BE");
testInvalidString("\x84\x32\x81\x30", "\x00\x00\x00%", "GB18030-2022", "UTF-32BE");
// Here two substitute characters are expected for the one four-byte input
testInvalidString("\x85\x31\x81\x30", "\x00\x00\x00%\x00\x00\x00%", "GB18030-2022", "UTF-32BE");
// Valid 4-byte codes for other Unicode planes
testValidString("\x90\x30\x81\x30", "\x00\x01\x00\x00", "GB18030-2022", "UTF-32BE");
testValidString("\xE3\x32\x9A\x35", "\x00\x10\xFF\xFF", "GB18030-2022", "UTF-32BE");
// Invalid 4-byte codes for other Unicode planes
testInvalidString("\x90\x30\x81\xFF", "\x00\x00\x00%", "GB18030-2022", "UTF-32BE");
testInvalidString("\xE3\x32\x9A\x36", "\x00\x00\x00%", "GB18030-2022", "UTF-32BE");
testInvalidString("\xE4\x30\x81\x35", "\x00\x00\x00%\x00\x00\x00%", "GB18030-2022", "UTF-32BE");
// Expected output: one substitute character, then the trailing byte 0x30 passed through as U+0030
testInvalidString("\x90\x30\x80\x30", "\x00\x00\x00%\x00\x00\x00\x30", "GB18030-2022", "UTF-32BE");
echo "Tested GB18030-2022 (SMP) <-> UTF-32BE\n";
// Encode direction for the whole BMP table
testAllValidChars($fromUnicode, 'UTF-16BE', 'GB18030-2022', false);
echo "Tested UTF-16BE -> GB18030-2022 (BMP)\n";
// Input is two UTF-32BE units: 0xAAB82D38 followed by '#'; expected output is '%' then '#'
convertInvalidString("\xAA\xB8\x2D\x38\x00\x00\x00#", "%#", "UTF-32BE", "GB18030-2022");
// Test "long" illegal character markers
mb_substitute_character("long");
convertInvalidString("\x81\x30\x81\xFF", "%", "GB18030-2022", "UTF-8");
convertInvalidString("\xE3\x32\x9A\x36", "%", "GB18030-2022", "UTF-8");
echo "Done!\n";
?>
--EXPECT--
Tested GB18030-2022 (BMP) -> UTF-16BE
Tested GB18030-2022 (SMP) <-> UTF-32BE
Tested UTF-16BE -> GB18030-2022 (BMP)
Done!