mirror of
https://github.com/php/php-src.git
synced 2026-03-24 08:12:21 +01:00
Add mbstring support for GB18030-2022 text encoding
The previous version of the GB18030 standard was published in 2005. This commit adds support for the updated (2022) version of this text encoding. The existing GB18030 implementation has been left unchanged for backwards compatibility; users who want to use the new standard must explicitly indicate the desired text encoding is 'GB18030-2022'. The document which defines GB18030-2022, published by the government of the People's Republic of China, defines three levels of standards compliance. This implementation is intended to achieve Implementation Level 3, which is the highest level of compliance. Experts in the GB18030 standard are requested to assess this implementation and report any deviation from the standard.
This commit is contained in:
@@ -11088,7 +11088,7 @@ static void mb_wchar_to_gb18030(uint32_t *in, size_t len, mb_convert_buf *buf, b
|
||||
continue;
|
||||
} else if (w >= ucs_a1_cp936_table_min && w < ucs_a1_cp936_table_max) {
|
||||
if (w == 0x1F9) {
|
||||
s = 0xA8Bf;
|
||||
s = 0xA8BF;
|
||||
} else {
|
||||
s = ucs_a1_cp936_table[w - ucs_a1_cp936_table_min];
|
||||
}
|
||||
@@ -11560,6 +11560,319 @@ static void mb_wchar_to_cp936(uint32_t *in, size_t len, mb_convert_buf *buf, boo
|
||||
MB_CONVERT_BUF_STORE(buf, out, limit);
|
||||
}
|
||||
|
||||
/* Decoder lookup for the two-byte codes 0xFE50-0xFEA0 (81 entries, indexed by
 * the linear two-byte index minus 0x5DD0). A zero entry means "no override here";
 * the decoder then falls back to its generic two-byte table lookup. */
static const unsigned short gb18030_2022_pua_tbl3[] = {
	0x0000, 0xE816, 0xE817, 0xE818, 0x0000, 0x0000, 0x0000, 0x0000, /* 0xFE50 */
	0x0000, 0x9FB4, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, /* 0xFE58 */
	0x0000, 0x9FB5, 0x0000, 0x0000, 0x0000, 0x0000, 0x9FB6, 0x9FB7, /* 0xFE60 */
	0x0000, 0x0000, 0x0000, 0x0000, 0xE831, 0x9FB8, 0x0000, 0x0000, /* 0xFE68 */
	0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0xE83B, 0x0000, /* 0xFE70 */
	0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x9FB9, 0x0000, /* 0xFE78 */
	0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, /* 0xFE80 */
	0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, /* 0xFE88 */
	0x9FBA, 0xE855, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, /* 0xFE90 */
	0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, /* 0xFE98 */
	0x9FBB                                                          /* 0xFEA0 */
};
|
||||
|
||||
static size_t mb_gb18030_2022_to_wchar(unsigned char **in, size_t *in_len, uint32_t *buf, size_t bufsize, unsigned int *state)
|
||||
{
|
||||
unsigned char *p = *in, *e = p + *in_len;
|
||||
uint32_t *out = buf, *limit = buf + bufsize;
|
||||
|
||||
while (p < e && out < limit) {
|
||||
unsigned char c = *p++;
|
||||
|
||||
if (c < 0x80) {
|
||||
*out++ = c;
|
||||
} else if (c == 0x80 || c == 0xFF) {
|
||||
*out++ = MBFL_BAD_INPUT;
|
||||
} else {
|
||||
if (p == e) {
|
||||
*out++ = MBFL_BAD_INPUT;
|
||||
break;
|
||||
}
|
||||
unsigned char c2 = *p++;
|
||||
|
||||
if (((c >= 0x81 && c <= 0x84) || (c >= 0x90 && c <= 0xE3)) && c2 >= 0x30 && c2 <= 0x39) {
|
||||
if (p >= e) {
|
||||
*out++ = MBFL_BAD_INPUT;
|
||||
break;
|
||||
}
|
||||
unsigned char c3 = *p++;
|
||||
|
||||
if (c3 >= 0x81 && c3 <= 0xFE && p < e) {
|
||||
unsigned char c4 = *p++;
|
||||
|
||||
if (c4 >= 0x30 && c4 <= 0x39) {
|
||||
if (c >= 0x90 && c <= 0xE3) {
|
||||
unsigned int w = ((((c - 0x90)*10 + (c2 - 0x30))*126 + (c3 - 0x81)))*10 + (c4 - 0x30) + 0x10000;
|
||||
*out++ = (w > 0x10FFFF) ? MBFL_BAD_INPUT : w;
|
||||
} else {
|
||||
/* Unicode BMP */
|
||||
unsigned int w = (((c - 0x81)*10 + (c2 - 0x30))*126 + (c3 - 0x81))*10 + (c4 - 0x30);
|
||||
if (w == 0x98A4) {
|
||||
*out++ = 0xE78D;
|
||||
} else if (w == 0x98A6) {
|
||||
*out++ = 0xE78E;
|
||||
} else if (w == 0x98A5) {
|
||||
*out++ = 0xE78F;
|
||||
} else if (w >= 0x98A7 && w <= 0x98AD) {
|
||||
*out++ = w + (0xE790 - 0x98A7);
|
||||
} else if (w == 0x1D21) {
|
||||
*out++ = 0xE7C7;
|
||||
} else if (w == 0x4A71) {
|
||||
*out++ = 0xE81E;
|
||||
} else if (w == 0x4A72) {
|
||||
*out++ = 0xE826;
|
||||
} else if (w >= 0x4A73 && w <= 0x4A74) {
|
||||
*out++ = w + (0xE82B - 0x4A73);
|
||||
} else if (w == 0x4A75) {
|
||||
*out++ = 0xE832;
|
||||
} else if (w == 0x4A76) {
|
||||
*out++ = 0xE843;
|
||||
} else if (w == 0x4A77) {
|
||||
*out++ = 0xE854;
|
||||
} else if (w == 0x4A78) {
|
||||
*out++ = 0xE864;
|
||||
} else if (w <= 0x99FB) {
|
||||
*out++ = w + mbfl_gb_uni_ofst[mbfl_bisec_srch(w, mbfl_gb2uni_tbl, mbfl_gb_uni_max)];
|
||||
} else {
|
||||
*out++ = MBFL_BAD_INPUT;
|
||||
}
|
||||
}
|
||||
} else {
|
||||
*out++ = MBFL_BAD_INPUT;
|
||||
}
|
||||
} else {
|
||||
*out++ = MBFL_BAD_INPUT;
|
||||
}
|
||||
} else if (((c >= 0xAA && c <= 0xAF) || (c >= 0xF8 && c <= 0xFE)) && (c2 >= 0xA1 && c2 <= 0xFE)) {
|
||||
/* UDA part 1, 2: U+E000-U+E4C5 */
|
||||
*out++ = 94*(c >= 0xF8 ? c - 0xF2 : c - 0xAA) + (c2 - 0xA1) + 0xE000;
|
||||
} else if (c >= 0xA1 && c <= 0xA7 && c2 >= 0x40 && c2 < 0xA1 && c2 != 0x7F) {
|
||||
/* UDA part 3: U+E4C6-U+E765 */
|
||||
*out++ = 96*(c - 0xA1) + c2 - (c2 >= 0x80 ? 0x41 : 0x40) + 0xE4C6;
|
||||
} else if (c2 >= 0x40 && c2 != 0x7F && c2 != 0xFF) {
|
||||
unsigned int w = (c - 0x81)*192 + c2 - 0x40;
|
||||
|
||||
if (w >= 0x192B) {
|
||||
if (w <= 0x1EBE) {
|
||||
if (w != 0x1963 && w != 0x1DBF && (w < 0x1E49 || w > 0x1E55) && w != 0x1E7F) {
|
||||
*out++ = gb18030_2022_pua_tbl1[w - 0x192B];
|
||||
continue;
|
||||
}
|
||||
} else if (w >= 0x413A) {
|
||||
if (w <= 0x413E) {
|
||||
*out++ = cp936_pua_tbl2[w - 0x413A];
|
||||
continue;
|
||||
} else if (w >= 0x5DD0 && w <= 0x5E20) {
|
||||
unsigned int c = gb18030_2022_pua_tbl3[w - 0x5DD0];
|
||||
if (c) {
|
||||
*out++ = c;
|
||||
continue;
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
if ((c >= 0x81 && c <= 0xA9) || (c >= 0xB0 && c <= 0xF7 && c2 >= 0xA1) || (c >= 0xAA && c <= 0xFE && c2 <= 0xA0)) {
|
||||
ZEND_ASSERT(w < cp936_ucs_table_size);
|
||||
*out++ = cp936_ucs_table[w];
|
||||
} else {
|
||||
*out++ = MBFL_BAD_INPUT;
|
||||
}
|
||||
} else {
|
||||
*out++ = MBFL_BAD_INPUT;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
*in_len = e - p;
|
||||
*in = p;
|
||||
return out - buf;
|
||||
}
|
||||
|
||||
/* Encode Unicode codepoints as GB18030-2022.
 *
 * in / len: input codepoints and their count.
 * buf: output conversion buffer; bytes are appended to it.
 * end: not referenced by this encoder.
 *
 * For each codepoint a one-, two- or four-byte code is accumulated in `s`
 * (most significant byte first). Two-byte mappings are tried first: the CP936
 * tables, plus the mappings that differ in GB18030-2022. Codepoints with no
 * two-byte code get a computed four-byte code. If nothing matches, the
 * codepoint is reported through MB_CONVERT_ERROR. */
static void mb_wchar_to_gb18030_2022(uint32_t *in, size_t len, mb_convert_buf *buf, bool end)
{
	unsigned char *out, *limit;
	MB_CONVERT_BUF_LOAD(buf, out, limit);
	MB_CONVERT_BUF_ENSURE(buf, out, limit, len);

	while (len--) {
		uint32_t w = *in++;
		unsigned int s = 0;

		if (w == 0) {
			/* s == 0 is used below as "no mapping found", so NUL is emitted directly */
			out = mb_convert_buf_add(out, 0);
			continue;
		} else if (w >= ucs_a1_cp936_table_min && w < ucs_a1_cp936_table_max) {
			if (w == 0x1F9) {
				s = 0xA8BF;
			} else {
				s = ucs_a1_cp936_table[w - ucs_a1_cp936_table_min];
			}
		} else if (w >= ucs_a2_cp936_table_min && w < ucs_a2_cp936_table_max) {
			if (w == 0x20AC) { /* Euro sign */
				s = 0xA2E3;
			} else {
				s = ucs_a2_cp936_table[w - ucs_a2_cp936_table_min];
			}
		} else if (w >= ucs_a3_cp936_table_min && w < ucs_a3_cp936_table_max) {
			s = ucs_a3_cp936_table[w - ucs_a3_cp936_table_min];
		} else if (w >= 0x9FB4 && w <= 0x9FBB) {
			/* Newly mapped in GB18030-2022 */
			if (w == 0x9FB4) {
				s = 0xFE59;
			} else if (w == 0x9FB5) {
				s = 0xFE61;
			} else if (w == 0x9FB6) {
				s = 0xFE66;
			} else if (w == 0x9FB7) {
				s = 0xFE67;
			} else if (w == 0x9FB8) {
				s = 0xFE6D;
			} else if (w == 0x9FB9) {
				s = 0xFE7E;
			} else if (w == 0x9FBA) {
				s = 0xFE90;
			} else {
				s = 0xFEA0;
			}
		} else if (w >= ucs_i_cp936_table_min && w < ucs_i_cp936_table_max) {
			s = ucs_i_cp936_table[w - ucs_i_cp936_table_min];
		} else if (w >= ucs_ci_cp936_table_min && w < ucs_ci_cp936_table_max) {
			/* U+F900-U+FA2F CJK Compatibility Ideographs */
			if (w == 0xF92C) {
				s = 0xFD9C;
			} else if (w == 0xF979) {
				s = 0xFD9D;
			} else if (w == 0xF995) {
				s = 0xFD9E;
			} else if (w == 0xF9E7) {
				s = 0xFD9F;
			} else if (w == 0xF9F1) {
				s = 0xFDA0;
			} else if (w >= 0xFA0C && w <= 0xFA29) {
				s = ucs_ci_s_cp936_table[w - 0xFA0C];
			}
		} else if (w >= ucs_cf_cp936_table_min && w < ucs_cf_cp936_table_max) {
			/* CJK Compatibility Forms */
			s = ucs_cf_cp936_table[w - ucs_cf_cp936_table_min];
		} else if (w >= ucs_sfv_cp936_table_min && w < ucs_sfv_cp936_table_max) {
			/* U+FE50-U+FE6F Small Form Variants */
			s = ucs_sfv_cp936_table[w - ucs_sfv_cp936_table_min];
		} else if (w >= ucs_hff_cp936_table_min && w < ucs_hff_cp936_table_max) {
			/* U+FF00-U+FFFF HW/FW Forms */
			if (w == 0xFF04) {
				s = 0xA1E7;
			} else if (w == 0xFF5E) {
				s = 0xA1AB;
			} else if (w >= 0xFF01 && w <= 0xFF5D) {
				s = w - 0xFF01 + 0xA3A1;
			} else if (w >= 0xFFE0 && w <= 0xFFE5) {
				s = ucs_hff_s_cp936_table[w - 0xFFE0];
			}
		} else if (w >= 0xE000 && w <= 0xE864) {
			/* PUA */
			if (w < 0xE766) {
				if (w < 0xE4C6) {
					/* U+E000-U+E4C5: 94 trail bytes (0xA1-0xFE) per lead byte;
					 * lead bytes 0xAA-0xAF, then 0xF8 onwards */
					unsigned int c1 = w - 0xE000;
					s = (c1 % 94) + 0xA1;
					c1 /= 94;
					s |= (c1 + (c1 < 0x06 ? 0xAA : 0xF2)) << 8;
				} else {
					/* U+E4C6-U+E765: 96 trail bytes per lead byte starting at 0xA1;
					 * trail bytes start at 0x40 and skip 0x7F */
					unsigned int c1 = w - 0xE4C6;
					s = ((c1 / 96) + 0xA1) << 8;
					c1 %= 96;
					s |= c1 + (c1 >= 0x3F ? 0x41 : 0x40);
				}
			} else {
				/* U+E766-U+E864: binary search the {first, last, code for first} range table */
				unsigned int k1 = 0, k2 = mbfl_gb18030_2022_pua_tbl_max;
				while (k1 < k2) {
					unsigned int k = (k1 + k2) >> 1;
					if (w < mbfl_gb18030_2022_pua_tbl[k][0]) {
						k2 = k;
					} else if (w > mbfl_gb18030_2022_pua_tbl[k][1]) {
						k1 = k + 1;
					} else {
						s = w - mbfl_gb18030_2022_pua_tbl[k][0] + mbfl_gb18030_2022_pua_tbl[k][2];
						break;
					}
				}
			}
		} else if (w >= 0xFE10 && w <= 0xFE19) {
			/* Newly mapped codepoints in GB18030-2022 */
			if (w == 0xFE11) {
				s = 0xA6DB;
			} else if (w == 0xFE12) {
				s = 0xA6DA;
			} else if (w <= 0xFE16) {
				s = w - (0xFE10 - 0xA6D9);
			} else if (w <= 0xFE18) {
				s = w - (0xFE17 - 0xA6EC);
			} else {
				s = 0xA6F3;
			}
		} else if (w == 0x1E3F) {
			/* Newly mapped codepoint in GB18030-2022 */
			s = 0xA8BC;
		}

		/* While GB18030 and CP936 are very similar, some mappings are different between these encodings;
		 * do a binary search in a table of differing codepoints to see if we have one */
		if (!s && w >= mbfl_gb18030_c_tbl_key[0] && w <= mbfl_gb18030_c_tbl_key[mbfl_gb18030_c_tbl_max-1]) {
			int i = mbfl_bisec_srch2(w, mbfl_gb18030_c_tbl_key, mbfl_gb18030_c_tbl_max);
			if (i >= 0) {
				s = mbfl_gb18030_c_tbl_val[i];
			}
		}

		/* If we have not yet found a suitable mapping for this codepoint, it requires a 4-byte code */
		if (!s && w >= 0x80 && w <= 0xFFFF) {
			/* BMP */
			int i = mbfl_bisec_srch(w, mbfl_uni2gb2022_tbl, mbfl_gb2022_uni_max);
			if (i >= 0) {
				/* Split the linear index into digit / 0x81-0xFE / digit / lead bytes */
				unsigned int c1 = w - mbfl_gb2022_uni_ofst[i];
				s = (c1 % 10) + 0x30;
				c1 /= 10;
				s |= ((c1 % 126) + 0x81) << 8;
				c1 /= 126;
				s |= ((c1 % 10) + 0x30) << 16;
				c1 /= 10;
				s |= (c1 + 0x81) << 24;
			}
		} else if (w >= 0x10000 && w <= 0x10FFFF) {
			/* Code set 3: Unicode U+10000-U+10FFFF */
			unsigned int c1 = w - 0x10000;
			s = (c1 % 10) + 0x30;
			c1 /= 10;
			s |= ((c1 % 126) + 0x81) << 8;
			c1 /= 126;
			s |= ((c1 % 10) + 0x30) << 16;
			c1 /= 10;
			s |= (c1 + 0x90) << 24;
		}

		if (!s) {
			/* Pass THIS encoder as the callback (previously the GB18030-2005 encoder was
			 * passed here). NOTE(review): the callback is presumably what encodes the
			 * error/substitution output, so it must use the 2022 mappings - confirm
			 * against the MB_CONVERT_ERROR definition. */
			MB_CONVERT_ERROR(buf, out, limit, w, mb_wchar_to_gb18030_2022);
			MB_CONVERT_BUF_ENSURE(buf, out, limit, len);
		} else if (s < 0x80) {
			out = mb_convert_buf_add(out, s);
		} else if (s > 0xFFFFFF) {
			/* Four-byte code */
			MB_CONVERT_BUF_ENSURE(buf, out, limit, len + 4);
			out = mb_convert_buf_add4(out, (s >> 24) & 0xFF, (s >> 16) & 0xFF, (s >> 8) & 0xFF, s & 0xFF);
		} else {
			/* Two-byte code */
			MB_CONVERT_BUF_ENSURE(buf, out, limit, len + 2);
			out = mb_convert_buf_add2(out, (s >> 8) & 0xFF, s & 0xFF);
		}
	}

	MB_CONVERT_BUF_STORE(buf, out, limit);
}
|
||||
|
||||
/* Step through a GB18030 string one character at a time. Find the last position at or
|
||||
* before `limit` which falls directly after the end of a (single or multi-byte) character */
|
||||
static zend_always_inline unsigned char* step_through_gb18030_str(unsigned char *p, unsigned char *limit)
|
||||
@@ -11673,6 +11986,21 @@ const mbfl_encoding mbfl_encoding_cp936 = {
|
||||
NULL,
|
||||
};
|
||||
|
||||
const mbfl_encoding mbfl_encoding_gb18030_2022 = {
|
||||
mbfl_no_encoding_gb18030_2022,
|
||||
"GB18030-2022",
|
||||
"GB18030-2022",
|
||||
NULL,
|
||||
NULL,
|
||||
MBFL_ENCTYPE_GL_UNSAFE,
|
||||
NULL,
|
||||
NULL,
|
||||
mb_gb18030_2022_to_wchar,
|
||||
mb_wchar_to_gb18030_2022,
|
||||
NULL,
|
||||
mb_cut_gb18030,
|
||||
};
|
||||
|
||||
/*
|
||||
* BIG5/CP950
|
||||
*/
|
||||
|
||||
@@ -32,6 +32,7 @@ extern const mbfl_encoding mbfl_encoding_euc_kr;
|
||||
extern const mbfl_encoding mbfl_encoding_uhc;
|
||||
|
||||
extern const mbfl_encoding mbfl_encoding_gb18030;
|
||||
extern const mbfl_encoding mbfl_encoding_gb18030_2022;
|
||||
extern const mbfl_encoding mbfl_encoding_cp936;
|
||||
extern const mbfl_encoding mbfl_encoding_big5;
|
||||
extern const mbfl_encoding mbfl_encoding_cp950;
|
||||
|
||||
@@ -92,6 +92,34 @@ static const unsigned short mbfl_gb18030_pua_tbl[][3] = {
|
||||
|
||||
static const int mbfl_gb18030_pua_tbl_max = sizeof(mbfl_gb18030_pua_tbl)/(sizeof(unsigned short)*3);
|
||||
|
||||
/* Ranges of PUA codepoints which have two-byte GB18030-2022 codes.
 * Each row is { first codepoint, last codepoint, two-byte code for the first codepoint };
 * the remaining codepoints in a row map to consecutive codes. Rows are sorted by
 * codepoint so that the encoder can binary search them. */
static const unsigned short mbfl_gb18030_2022_pua_tbl[][3] = {
	{ 0xe766, 0xe76b, 0xa2ab },
	{ 0xe76d, 0xe76d, 0xa2e4 },
	{ 0xe76e, 0xe76f, 0xa2ef },
	{ 0xe770, 0xe771, 0xa2fd },
	{ 0xe772, 0xe77c, 0xa4f4 },
	{ 0xe77d, 0xe784, 0xa5f7 },
	{ 0xe785, 0xe78c, 0xa6b9 },
	{ 0xe797, 0xe79f, 0xa6f6 },
	{ 0xe7a0, 0xe7ae, 0xa7c2 },
	{ 0xe7af, 0xe7bb, 0xa7f2 },
	{ 0xe7bc, 0xe7c6, 0xa896 },
	{ 0xe7c9, 0xe7cc, 0xa8c1 },
	{ 0xe7cd, 0xe7e1, 0xa8ea },
	{ 0xe7e2, 0xe7e2, 0xa958 },
	{ 0xe7e3, 0xe7e3, 0xa95b },
	{ 0xe7e4, 0xe7e6, 0xa95d },
	{ 0xe7f4, 0xe800, 0xa997 },
	{ 0xe801, 0xe80f, 0xa9f0 },
	{ 0xe810, 0xe814, 0xd7fa },
	{ 0xe816, 0xe818, 0xfe51 },
	{ 0xe831, 0xe831, 0xfe6c },
	{ 0xe83b, 0xe83b, 0xfe76 },
	{ 0xe855, 0xe855, 0xfe91 }
};

/* Number of rows in mbfl_gb18030_2022_pua_tbl */
static const int mbfl_gb18030_2022_pua_tbl_max = sizeof(mbfl_gb18030_2022_pua_tbl) / sizeof(mbfl_gb18030_2022_pua_tbl[0]);
|
||||
|
||||
static const unsigned short mbfl_gb2uni_tbl[] = {
|
||||
0x0000, 0x0023, 0x0024, 0x0025, 0x0026, 0x002c, 0x002d, 0x0031,
|
||||
0x0032, 0x0050, 0x0051, 0x0058, 0x0059, 0x005e, 0x005f, 0x005f,
|
||||
@@ -233,4 +261,285 @@ static const unsigned short mbfl_gb_uni_ofst[] = {
|
||||
|
||||
static const int mbfl_gb_uni_max = sizeof(mbfl_gb_uni_ofst)/sizeof(unsigned short);
|
||||
|
||||
/* Codepoint ranges searched by the GB18030-2022 encoder for BMP codepoints which
 * need a four-byte code. NOTE(review): the values read as (first, last) pairs,
 * one pair per entry of mbfl_gb2022_uni_ofst; confirm against mbfl_bisec_srch.
 * Pairs are visually grouped below, four per line. */
static const unsigned short mbfl_uni2gb2022_tbl[] = {
	0x0080, 0x00a3,  0x00a5, 0x00a6,  0x00a9, 0x00af,  0x00b2, 0x00b6,
	0x00b8, 0x00d6,  0x00d8, 0x00df,  0x00e2, 0x00e7,  0x00eb, 0x00eb,
	0x00ee, 0x00f1,  0x00f4, 0x00f6,  0x00f8, 0x00f8,  0x00fb, 0x00fb,
	0x00fd, 0x0100,  0x0102, 0x0112,  0x0114, 0x011a,  0x011c, 0x012a,
	0x012c, 0x0143,  0x0145, 0x0147,  0x0149, 0x014c,  0x014e, 0x016a,
	0x016c, 0x01cd,  0x01cf, 0x01cf,  0x01d1, 0x01d1,  0x01d3, 0x01d3,
	0x01d5, 0x01d5,  0x01d7, 0x01d7,  0x01d9, 0x01d9,  0x01db, 0x01db,
	0x01dd, 0x01f8,  0x01fa, 0x0250,  0x0252, 0x0260,  0x0262, 0x02c6,
	0x02c8, 0x02c8,  0x02cc, 0x02d8,  0x02da, 0x0390,  0x03a2, 0x03a2,
	0x03aa, 0x03b0,  0x03c2, 0x03c2,  0x03ca, 0x0400,  0x0402, 0x040f,
	0x0450, 0x0450,  0x0452, 0x200f,  0x2011, 0x2012,  0x2017, 0x2017,
	0x201a, 0x201b,  0x201e, 0x2024,  0x2027, 0x202f,  0x2031, 0x2031,
	0x2034, 0x2034,  0x2036, 0x203a,  0x203c, 0x20ab,  0x20ad, 0x2102,
	0x2104, 0x2104,  0x2106, 0x2108,  0x210a, 0x2115,  0x2117, 0x2120,
	0x2122, 0x215f,  0x216c, 0x216f,  0x217a, 0x218f,  0x2194, 0x2195,
	0x219a, 0x2207,  0x2209, 0x220e,  0x2210, 0x2210,  0x2212, 0x2214,
	0x2216, 0x2219,  0x221b, 0x221c,  0x2221, 0x2222,  0x2224, 0x2224,
	0x2226, 0x2226,  0x222c, 0x222d,  0x222f, 0x2233,  0x2238, 0x223c,
	0x223e, 0x2247,  0x2249, 0x224b,  0x224d, 0x2251,  0x2253, 0x225f,
	0x2262, 0x2263,  0x2268, 0x226d,  0x2270, 0x2294,  0x2296, 0x2298,
	0x229a, 0x22a4,  0x22a6, 0x22be,  0x22c0, 0x2311,  0x2313, 0x245f,
	0x246a, 0x2473,  0x249c, 0x24ff,  0x254c, 0x254f,  0x2574, 0x2580,
	0x2590, 0x2592,  0x2596, 0x259f,  0x25a2, 0x25b1,  0x25b4, 0x25bb,
	0x25be, 0x25c5,  0x25c8, 0x25ca,  0x25cc, 0x25cd,  0x25d0, 0x25e1,
	0x25e6, 0x2604,  0x2607, 0x2608,  0x260a, 0x263f,  0x2641, 0x2641,
	0x2643, 0x2e80,  0x2e82, 0x2e83,  0x2e85, 0x2e87,  0x2e89, 0x2e8a,
	0x2e8d, 0x2e96,  0x2e98, 0x2ea6,  0x2ea8, 0x2ea9,  0x2eab, 0x2ead,
	0x2eaf, 0x2eb2,  0x2eb4, 0x2eb5,  0x2eb8, 0x2eba,  0x2ebc, 0x2ec9,
	0x2ecb, 0x2fef,  0x2ffc, 0x2fff,  0x3004, 0x3004,  0x3018, 0x301c,
	0x301f, 0x3020,  0x302a, 0x303d,  0x303f, 0x3040,  0x3094, 0x309a,
	0x309f, 0x30a0,  0x30f7, 0x30fb,  0x30ff, 0x3104,  0x312a, 0x321f,
	0x322a, 0x3230,  0x3232, 0x32a2,  0x32a4, 0x338d,  0x3390, 0x339b,
	0x339f, 0x33a0,  0x33a2, 0x33c3,  0x33c5, 0x33cd,  0x33cf, 0x33d0,
	0x33d3, 0x33d4,  0x33d6, 0x3446,  0x3448, 0x3472,  0x3474, 0x359d,
	0x359f, 0x360d,  0x360f, 0x3619,  0x361b, 0x3917,  0x3919, 0x396d,
	0x396f, 0x39ce,  0x39d1, 0x39de,  0x39e0, 0x3a72,  0x3a74, 0x3b4d,
	0x3b4f, 0x3c6d,  0x3c6f, 0x3cdf,  0x3ce1, 0x4055,  0x4057, 0x415e,
	0x4160, 0x4336,  0x4338, 0x43ab,  0x43ad, 0x43b0,  0x43b2, 0x43dc,
	0x43de, 0x44d5,  0x44d7, 0x464b,  0x464d, 0x4660,  0x4662, 0x4722,
	0x4724, 0x4728,  0x472a, 0x477b,  0x477d, 0x478c,  0x478e, 0x4946,
	0x4948, 0x4979,  0x497b, 0x497c,  0x497e, 0x4981,  0x4984, 0x4984,
	0x4987, 0x499a,  0x499c, 0x499e,  0x49a0, 0x49b5,  0x49b8, 0x4c76,
	0x4c78, 0x4c9e,  0x4ca4, 0x4d12,  0x4d1a, 0x4dad,  0x4daf, 0x4dff,
	0x9fa6, 0xd7ff,  0xe76c, 0xe76c,  0xe78d, 0xe78d,  0xe78e, 0xe78e,
	0xe78f, 0xe78f,  0xe790, 0xe796,  0xe7c7, 0xe7c7,  0xe7c8, 0xe7c8,
	0xe7e7, 0xe7f3,  0xe815, 0xe815,  0xe819, 0xe81d,  0xe81e, 0xe81e,
	0xe81f, 0xe825,  0xe826, 0xe826,  0xe827, 0xe82a,  0xe82b, 0xe82c,
	0xe82d, 0xe830,  0xe832, 0xe832,  0xe833, 0xe83a,  0xe83c, 0xe842,
	0xe843, 0xe843,  0xe844, 0xe853,  0xe854, 0xe854,  0xe856, 0xe863,
	0xe864, 0xe864,  0xe865, 0xf92b,  0xf92d, 0xf978,  0xf97a, 0xf994,
	0xf996, 0xf9e6,  0xf9e8, 0xf9f0,  0xf9f2, 0xfa0b,  0xfa10, 0xfa10,
	0xfa12, 0xfa12,  0xfa15, 0xfa17,  0xfa19, 0xfa1e,  0xfa22, 0xfa22,
	0xfa25, 0xfa26,  0xfa2a, 0xfe2f,  0xfe32, 0xfe32,  0xfe45, 0xfe48,
	0xfe53, 0xfe53,  0xfe58, 0xfe58,  0xfe67, 0xfe67,  0xfe6c, 0xff00,
	0xff5f, 0xffdf,  0xffe6, 0xffff,
};
|
||||
|
||||
/* Per-range offsets used by the GB18030-2022 encoder: for a codepoint found at
 * index i of the range search, (codepoint - mbfl_gb2022_uni_ofst[i]) is the
 * linear index from which the four-byte code is built. */
static const unsigned short mbfl_gb2022_uni_ofst[] = {
	  128,   129,   131,   133,   134,   135,   137,   140,
	  142,   144,   145,   147,   148,   149,   150,   151,
	  152,   153,   154,   155,   156,   157,   158,   159,
	  160,   161,   162,   163,   164,   165,   166,   167,
	  168,   171,   172,   189,   196,   213,   220,   221,
	  285,   286,   287,   291,   293,   295,   297,   298,
	  300,   301,   302,   303,   304,   305,   306,   307,
	  308,   320,   330,   334,   338,   339,   340,   341,
	  342,   343,   347,   348,   349,   354,   355,   359,
	  360,   361,   362,   363,   365,   369,   371,   372,
	  373,   374,   375,   376,   386,   426,   502,   538,
	  553,   556,   558,   560,   562,   564,   565,   567,
	  571,   573,   574,   575,   576,   577,   578,   579,
	  581,   582,   583,   584,   585,   586,   588,   589,
	  590,   602,   606,   625,   627,   636,   637,   720,
	  724,   810,   813,   850,   860,   861,   862,   864,
	  867,   868,   869,   870,   872,   873,   874,   875,
	  876,   877,   878,   879,   880,   882,   883,   884,
	  885,   886,   887,   888,   889,   890,   891,   892,
	  893,   894,   895,   896,   897,   898,   899,   900,
	  901,   902,   903,   905,   907,   908,   909,   911,
	  912,   917,   924,   925, 21827, 25775, 20201, 20200,
	20202, 20201, 51878, 25866, 25896, 25929, 25932, 40365,
	25933, 40372, 25934, 40376, 25936, 40381, 25938, 25939,
	40397, 25940, 40413, 25942, 40428, 25943, 25944, 25945,
	25946, 25947, 25948, 25952, 25953, 25955, 25956, 25959,
	25961, 25964, 25966, 25984, 25994, 25998, 26012, 26016,
	26110, 26116,
};

/* Number of entries in mbfl_gb2022_uni_ofst */
static const int mbfl_gb2022_uni_max = sizeof(mbfl_gb2022_uni_ofst) / sizeof(mbfl_gb2022_uni_ofst[0]);
|
||||
|
||||
/* Decoder lookup for the two-byte codes 0xA2AB-0xA9FF. The index is the linear
 * two-byte index, (lead - 0x81)*192 + (trail - 0x40), minus 0x192B; each lead
 * byte therefore occupies 192 consecutive entries (trail bytes 0x40-0xFF).
 * NOTE(review): unlike the other tables visible in this header, this one is
 * defined without `static`; confirm whether external linkage is intended. */
const unsigned short gb18030_2022_pua_tbl1[] = {
	/* Lead byte 0xA2, starting from trail byte 0xAB */
	0xE766, 0xE767, 0xE768, 0xE769, 0xE76A,
	0xE76B, 0x2488, 0x2489, 0x248a, 0x248b, 0x248c, 0x248d, 0x248e,
	0x248f, 0x2490, 0x2491, 0x2492, 0x2493, 0x2494, 0x2495, 0x2496,
	0x2497, 0x2498, 0x2499, 0x249a, 0x249b, 0x2474, 0x2475, 0x2476,
	0x2477, 0x2478, 0x2479, 0x247a, 0x247b, 0x247c, 0x247d, 0x247e,
	0x247f, 0x2480, 0x2481, 0x2482, 0x2483, 0x2484, 0x2485, 0x2486,
	0x2487, 0x2460, 0x2461, 0x2462, 0x2463, 0x2464, 0x2465, 0x2466,
	0x2467, 0x2468, 0x2469, 0xE76C, 0xE76D, 0x3220, 0x3221, 0x3222,
	0x3223, 0x3224, 0x3225, 0x3226, 0x3227, 0x3228, 0x3229, 0xE76E,
	0xE76F, 0x2160, 0x2161, 0x2162, 0x2163, 0x2164, 0x2165, 0x2166,
	0x2167, 0x2168, 0x2169, 0x216a, 0x216b, 0xE770, 0xE771, 0x0000,

	/* Lead byte 0xA3, trail bytes 0x40-0xFF */
	0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000,
	0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000,
	0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000,
	0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000,
	0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000,
	0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000,
	0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000,
	0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000,
	0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000,
	0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000,
	0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000,
	0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000,
	0x0000, 0xff01, 0xff02, 0xff03, 0xffe5, 0xff05, 0xff06, 0xff07,
	0xff08, 0xff09, 0xff0a, 0xff0b, 0xff0c, 0xff0d, 0xff0e, 0xff0f,
	0xff10, 0xff11, 0xff12, 0xff13, 0xff14, 0xff15, 0xff16, 0xff17,
	0xff18, 0xff19, 0xff1a, 0xff1b, 0xff1c, 0xff1d, 0xff1e, 0xff1f,
	0xff20, 0xff21, 0xff22, 0xff23, 0xff24, 0xff25, 0xff26, 0xff27,
	0xff28, 0xff29, 0xff2a, 0xff2b, 0xff2c, 0xff2d, 0xff2e, 0xff2f,
	0xff30, 0xff31, 0xff32, 0xff33, 0xff34, 0xff35, 0xff36, 0xff37,
	0xff38, 0xff39, 0xff3a, 0xff3b, 0xff3c, 0xff3d, 0xff3e, 0xff3f,
	0xff40, 0xff41, 0xff42, 0xff43, 0xff44, 0xff45, 0xff46, 0xff47,
	0xff48, 0xff49, 0xff4a, 0xff4b, 0xff4c, 0xff4d, 0xff4e, 0xff4f,
	0xff50, 0xff51, 0xff52, 0xff53, 0xff54, 0xff55, 0xff56, 0xff57,
	0xff58, 0xff59, 0xff5a, 0xff5b, 0xff5c, 0xff5d, 0xffe3, 0x0000,

	/* Lead byte 0xA4, trail bytes 0x40-0xFF */
	0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000,
	0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000,
	0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000,
	0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000,
	0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000,
	0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000,
	0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000,
	0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000,
	0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000,
	0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000,
	0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000,
	0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000,
	0x0000, 0x3041, 0x3042, 0x3043, 0x3044, 0x3045, 0x3046, 0x3047,
	0x3048, 0x3049, 0x304a, 0x304b, 0x304c, 0x304d, 0x304e, 0x304f,
	0x3050, 0x3051, 0x3052, 0x3053, 0x3054, 0x3055, 0x3056, 0x3057,
	0x3058, 0x3059, 0x305a, 0x305b, 0x305c, 0x305d, 0x305e, 0x305f,
	0x3060, 0x3061, 0x3062, 0x3063, 0x3064, 0x3065, 0x3066, 0x3067,
	0x3068, 0x3069, 0x306a, 0x306b, 0x306c, 0x306d, 0x306e, 0x306f,
	0x3070, 0x3071, 0x3072, 0x3073, 0x3074, 0x3075, 0x3076, 0x3077,
	0x3078, 0x3079, 0x307a, 0x307b, 0x307c, 0x307d, 0x307e, 0x307f,
	0x3080, 0x3081, 0x3082, 0x3083, 0x3084, 0x3085, 0x3086, 0x3087,
	0x3088, 0x3089, 0x308a, 0x308b, 0x308c, 0x308d, 0x308e, 0x308f,
	0x3090, 0x3091, 0x3092, 0x3093, 0xE772, 0xE773, 0xE774, 0xE775,
	0xE776, 0xE777, 0xE778, 0xE779, 0xE77A, 0xE77B, 0xE77C, 0x0000,

	/* Lead byte 0xA5, trail bytes 0x40-0xFF */
	0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000,
	0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000,
	0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000,
	0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000,
	0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000,
	0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000,
	0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000,
	0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000,
	0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000,
	0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000,
	0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000,
	0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000,
	0x0000, 0x30a1, 0x30a2, 0x30a3, 0x30a4, 0x30a5, 0x30a6, 0x30a7,
	0x30a8, 0x30a9, 0x30aa, 0x30ab, 0x30ac, 0x30ad, 0x30ae, 0x30af,
	0x30b0, 0x30b1, 0x30b2, 0x30b3, 0x30b4, 0x30b5, 0x30b6, 0x30b7,
	0x30b8, 0x30b9, 0x30ba, 0x30bb, 0x30bc, 0x30bd, 0x30be, 0x30bf,
	0x30c0, 0x30c1, 0x30c2, 0x30c3, 0x30c4, 0x30c5, 0x30c6, 0x30c7,
	0x30c8, 0x30c9, 0x30ca, 0x30cb, 0x30cc, 0x30cd, 0x30ce, 0x30cf,
	0x30d0, 0x30d1, 0x30d2, 0x30d3, 0x30d4, 0x30d5, 0x30d6, 0x30d7,
	0x30d8, 0x30d9, 0x30da, 0x30db, 0x30dc, 0x30dd, 0x30de, 0x30df,
	0x30e0, 0x30e1, 0x30e2, 0x30e3, 0x30e4, 0x30e5, 0x30e6, 0x30e7,
	0x30e8, 0x30e9, 0x30ea, 0x30eb, 0x30ec, 0x30ed, 0x30ee, 0x30ef,
	0x30f0, 0x30f1, 0x30f2, 0x30f3, 0x30f4, 0x30f5, 0x30f6, 0xE77D,
	0xE77E, 0xE77F, 0xE780, 0xE781, 0xE782, 0xE783, 0xE784, 0x0000,

	/* Lead byte 0xA6, trail bytes 0x40-0xFF */
	0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000,
	0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000,
	0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000,
	0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000,
	0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000,
	0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000,
	0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000,
	0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000,
	0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000,
	0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000,
	0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000,
	0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000,
	0x0000, 0x0391, 0x0392, 0x0393, 0x0394, 0x0395, 0x0396, 0x0397,
	0x0398, 0x0399, 0x039a, 0x039b, 0x039c, 0x039d, 0x039e, 0x039f,
	0x03a0, 0x03a1, 0x03a3, 0x03a4, 0x03a5, 0x03a6, 0x03a7, 0x03a8,
	0x03a9, 0xE785, 0xE786, 0xE787, 0xE788, 0xE789, 0xE78A, 0xE78B,
	0xE78C, 0x03b1, 0x03b2, 0x03b3, 0x03b4, 0x03b5, 0x03b6, 0x03b7,
	0x03b8, 0x03b9, 0x03ba, 0x03bb, 0x03bc, 0x03bd, 0x03be, 0x03bf,
	0x03c0, 0x03c1, 0x03c3, 0x03c4, 0x03c5, 0x03c6, 0x03c7, 0x03c8,
	0x03c9, 0xFE10, 0xFE12, 0xFE11, 0xFE13, 0xFE14, 0xFE15, 0xFE16,
	0xfe35, 0xfe36, 0xfe39, 0xfe3a, 0xfe3f, 0xfe40, 0xfe3d, 0xfe3e,
	0xfe41, 0xfe42, 0xfe43, 0xfe44, 0xFE17, 0xFE18, 0xfe3b, 0xfe3c,
	0xfe37, 0xfe38, 0xfe31, 0xFE19, 0xfe33, 0xfe34, 0xE797, 0xE798,
	0xE799, 0xE79A, 0xE79B, 0xE79C, 0xE79D, 0xE79E, 0xE79F, 0x0000,

	/* Lead byte 0xA7, trail bytes 0x40-0xFF */
	0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000,
	0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000,
	0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000,
	0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000,
	0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000,
	0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000,
	0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000,
	0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000,
	0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000,
	0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000,
	0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000,
	0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000,
	0x0000, 0x0410, 0x0411, 0x0412, 0x0413, 0x0414, 0x0415, 0x0401,
	0x0416, 0x0417, 0x0418, 0x0419, 0x041a, 0x041b, 0x041c, 0x041d,
	0x041e, 0x041f, 0x0420, 0x0421, 0x0422, 0x0423, 0x0424, 0x0425,
	0x0426, 0x0427, 0x0428, 0x0429, 0x042a, 0x042b, 0x042c, 0x042d,
	0x042e, 0x042f, 0xE7A0, 0xE7A1, 0xE7A2, 0xE7A3, 0xE7A4, 0xE7A5,
	0xE7A6, 0xE7A7, 0xE7A8, 0xE7A9, 0xE7AA, 0xE7AB, 0xE7AC, 0xE7AD,
	0xE7AE, 0x0430, 0x0431, 0x0432, 0x0433, 0x0434, 0x0435, 0x0451,
	0x0436, 0x0437, 0x0438, 0x0439, 0x043a, 0x043b, 0x043c, 0x043d,
	0x043e, 0x043f, 0x0440, 0x0441, 0x0442, 0x0443, 0x0444, 0x0445,
	0x0446, 0x0447, 0x0448, 0x0449, 0x044a, 0x044b, 0x044c, 0x044d,
	0x044e, 0x044f, 0xE7AF, 0xE7B0, 0xE7B1, 0xE7B2, 0xE7B3, 0xE7B4,
	0xE7B5, 0xE7B6, 0xE7B7, 0xE7B8, 0xE7B9, 0xE7BA, 0xE7BB, 0x0000,

	/* Lead byte 0xA8, trail bytes 0x40-0xFF */
	0x02ca, 0x02cb, 0x02d9, 0x2013, 0x2015, 0x2025, 0x2035, 0x2105,
	0x2109, 0x2196, 0x2197, 0x2198, 0x2199, 0x2215, 0x221f, 0x2223,
	0x2252, 0x2266, 0x2267, 0x22bf, 0x2550, 0x2551, 0x2552, 0x2553,
	0x2554, 0x2555, 0x2556, 0x2557, 0x2558, 0x2559, 0x255a, 0x255b,
	0x255c, 0x255d, 0x255e, 0x255f, 0x2560, 0x2561, 0x2562, 0x2563,
	0x2564, 0x2565, 0x2566, 0x2567, 0x2568, 0x2569, 0x256a, 0x256b,
	0x256c, 0x256d, 0x256e, 0x256f, 0x2570, 0x2571, 0x2572, 0x2573,
	0x2581, 0x2582, 0x2583, 0x2584, 0x2585, 0x2586, 0x2587, 0x0000,
	0x2588, 0x2589, 0x258a, 0x258b, 0x258c, 0x258d, 0x258e, 0x258f,
	0x2593, 0x2594, 0x2595, 0x25bc, 0x25bd, 0x25e2, 0x25e3, 0x25e4,
	0x25e5, 0x2609, 0x2295, 0x3012, 0x301d, 0x301e, 0xE7BC, 0xE7BD,
	0xE7BE, 0xE7BF, 0xE7C0, 0xE7C1, 0xE7C2, 0xE7C3, 0xE7C4, 0xE7C5,
	0xE7C6, 0x0101, 0x00e1, 0x01ce, 0x00e0, 0x0113, 0x00e9, 0x011b,
	0x00e8, 0x012b, 0x00ed, 0x01d0, 0x00ec, 0x014d, 0x00f3, 0x01d2,
	0x00f2, 0x016b, 0x00fa, 0x01d4, 0x00f9, 0x01d6, 0x01d8, 0x01da,
	0x01dc, 0x00fc, 0x00ea, 0x0251, 0x1E3F, 0x0144, 0x0148, 0x01F9,
	0x0261, 0xE7C9, 0xE7CA, 0xE7CB, 0xE7CC, 0x3105, 0x3106, 0x3107,
	0x3108, 0x3109, 0x310a, 0x310b, 0x310c, 0x310d, 0x310e, 0x310f,
	0x3110, 0x3111, 0x3112, 0x3113, 0x3114, 0x3115, 0x3116, 0x3117,
	0x3118, 0x3119, 0x311a, 0x311b, 0x311c, 0x311d, 0x311e, 0x311f,
	0x3120, 0x3121, 0x3122, 0x3123, 0x3124, 0x3125, 0x3126, 0x3127,
	0x3128, 0x3129, 0xE7CD, 0xE7CE, 0xE7CF, 0xE7D0, 0xE7D1, 0xE7D2,
	0xE7D3, 0xE7D4, 0xE7D5, 0xE7D6, 0xE7D7, 0xE7D8, 0xE7D9, 0xE7DA,
	0xE7DB, 0xE7DC, 0xE7DD, 0xE7DE, 0xE7DF, 0xE7E0, 0xE7E1, 0x0000,

	/* Lead byte 0xA9, trail bytes 0x40-0xFF */
	0x3021, 0x3022, 0x3023, 0x3024, 0x3025, 0x3026, 0x3027, 0x3028,
	0x3029, 0x32a3, 0x338e, 0x338f, 0x339c, 0x339d, 0x339e, 0x33a1,
	0x33c4, 0x33ce, 0x33d1, 0x33d2, 0x33d5, 0xfe30, 0xffe2, 0xffe4,
	0xE7E2, 0x2121, 0x3231, 0xE7E3, 0x2010, 0xE7E4, 0xE7E5, 0xE7E6,
	0x30fc, 0x309b, 0x309c, 0x30fd, 0x30fe, 0x3006, 0x309d, 0x309e,
	0xfe49, 0xfe4a, 0xfe4b, 0xfe4c, 0xfe4d, 0xfe4e, 0xfe4f, 0xfe50,
	0xfe51, 0xfe52, 0xfe54, 0xfe55, 0xfe56, 0xfe57, 0xfe59, 0xfe5a,
	0xfe5b, 0xfe5c, 0xfe5d, 0xfe5e, 0xfe5f, 0xfe60, 0xfe61, 0x0000,
	0xfe62, 0xfe63, 0xfe64, 0xfe65, 0xfe66, 0xfe68, 0xfe69, 0xfe6a,
	0xfe6b, 0x303E, 0x2FF0, 0x2FF1, 0x2FF2, 0x2FF3, 0x2FF4, 0x2FF5,
	0x2FF6, 0x2FF7, 0x2FF8, 0x2FF9, 0x2FFA, 0x2FFB, 0x3007, 0xE7F4,
	0xE7F5, 0xE7F6, 0xE7F7, 0xE7F8, 0xE7F9, 0xE7FA, 0xE7FB, 0xE7FC,
	0xE7FD, 0xE7FE, 0xE7FF, 0xE800, 0x2500, 0x2501, 0x2502, 0x2503,
	0x2504, 0x2505, 0x2506, 0x2507, 0x2508, 0x2509, 0x250a, 0x250b,
	0x250c, 0x250d, 0x250e, 0x250f, 0x2510, 0x2511, 0x2512, 0x2513,
	0x2514, 0x2515, 0x2516, 0x2517, 0x2518, 0x2519, 0x251a, 0x251b,
	0x251c, 0x251d, 0x251e, 0x251f, 0x2520, 0x2521, 0x2522, 0x2523,
	0x2524, 0x2525, 0x2526, 0x2527, 0x2528, 0x2529, 0x252a, 0x252b,
	0x252c, 0x252d, 0x252e, 0x252f, 0x2530, 0x2531, 0x2532, 0x2533,
	0x2534, 0x2535, 0x2536, 0x2537, 0x2538, 0x2539, 0x253a, 0x253b,
	0x253c, 0x253d, 0x253e, 0x253f, 0x2540, 0x2541, 0x2542, 0x2543,
	0x2544, 0x2545, 0x2546, 0x2547, 0x2548, 0x2549, 0x254a, 0x254b,
	0xE801, 0xE802, 0xE803, 0xE804, 0xE805, 0xE806, 0xE807, 0xE808,
	0xE809, 0xE80A, 0xE80B, 0xE80C, 0xE80D, 0xE80E, 0xE80F, 0x0000,
};
|
||||
|
||||
#endif /* UNICODE_TABLE_GB18030_H */
|
||||
|
||||
@@ -88,10 +88,10 @@ static const mbfl_encoding *mbfl_encoding_ptr_list[] = {
|
||||
&mbfl_encoding_sjis,
|
||||
&mbfl_encoding_eucjp_win,
|
||||
&mbfl_encoding_eucjp2004,
|
||||
&mbfl_encoding_sjis_docomo,
|
||||
&mbfl_encoding_sjis_kddi,
|
||||
&mbfl_encoding_sjis_sb,
|
||||
&mbfl_encoding_sjis_mac,
|
||||
&mbfl_encoding_sjis_docomo,
|
||||
&mbfl_encoding_sjis_kddi,
|
||||
&mbfl_encoding_sjis_sb,
|
||||
&mbfl_encoding_sjis_mac,
|
||||
&mbfl_encoding_sjis2004,
|
||||
&mbfl_encoding_utf8_docomo,
|
||||
&mbfl_encoding_utf8_kddi_a,
|
||||
@@ -104,6 +104,7 @@ static const mbfl_encoding *mbfl_encoding_ptr_list[] = {
|
||||
&mbfl_encoding_2022jp,
|
||||
&mbfl_encoding_2022jpms,
|
||||
&mbfl_encoding_gb18030,
|
||||
&mbfl_encoding_gb18030_2022,
|
||||
&mbfl_encoding_cp1252,
|
||||
&mbfl_encoding_cp1254,
|
||||
&mbfl_encoding_8859_1,
|
||||
@@ -148,145 +149,176 @@ static const mbfl_encoding *mbfl_encoding_ptr_list[] = {
|
||||
* Command used: gperf encodings.txt --readonly-tables --null-strings --ignore-case
|
||||
* The encodings.txt contains all the contents of the name fields of the mbfl_encoding_ptr_list table. */
|
||||
|
||||
static const int8_t mbfl_encoding_ptr_list_after_hashing[187] = {
|
||||
-1, -1, -1,
|
||||
65,
|
||||
static const int8_t mbfl_encoding_ptr_list_after_hashing[231] = {
|
||||
-1, -1,
|
||||
61,
|
||||
66,
|
||||
23,
|
||||
73,
|
||||
59,
|
||||
-1,
|
||||
1,
|
||||
-1, -1, -1,
|
||||
11,
|
||||
-1,
|
||||
5,
|
||||
9,
|
||||
-1,
|
||||
60,
|
||||
36,
|
||||
-1, -1,
|
||||
58,
|
||||
42,
|
||||
-1, -1,
|
||||
18,
|
||||
27,
|
||||
77,
|
||||
26,
|
||||
40,
|
||||
72,
|
||||
12,
|
||||
10,
|
||||
2,
|
||||
31,
|
||||
-1, -1,
|
||||
75,
|
||||
74,
|
||||
33,
|
||||
45,
|
||||
-1,
|
||||
67,
|
||||
13,
|
||||
-1,
|
||||
51,
|
||||
53,
|
||||
11,
|
||||
1,
|
||||
-1,
|
||||
48,
|
||||
56,
|
||||
-1,
|
||||
38,
|
||||
20,
|
||||
46,
|
||||
-1,
|
||||
52,
|
||||
54,
|
||||
-1,
|
||||
14,
|
||||
24,
|
||||
44,
|
||||
39,
|
||||
43,
|
||||
2,
|
||||
40,
|
||||
46,
|
||||
27,
|
||||
76,
|
||||
26,
|
||||
-1,
|
||||
30,
|
||||
49,
|
||||
57,
|
||||
76,
|
||||
-1, -1,
|
||||
68,
|
||||
73,
|
||||
7,
|
||||
16,
|
||||
-1,
|
||||
35,
|
||||
66,
|
||||
-1, -1, -1,
|
||||
75,
|
||||
-1,
|
||||
47,
|
||||
55,
|
||||
-1, -1, -1,
|
||||
63,
|
||||
15,
|
||||
78,
|
||||
36,
|
||||
-1,
|
||||
50,
|
||||
58,
|
||||
8,
|
||||
17,
|
||||
-1,
|
||||
21,
|
||||
70,
|
||||
-1,
|
||||
29,
|
||||
5,
|
||||
-1, -1,
|
||||
69,
|
||||
39,
|
||||
7,
|
||||
-1, -1,
|
||||
64,
|
||||
67,
|
||||
-1, -1,
|
||||
30,
|
||||
48,
|
||||
56,
|
||||
-1, -1, -1,
|
||||
35,
|
||||
74,
|
||||
-1, -1,
|
||||
24,
|
||||
53,
|
||||
62,
|
||||
43,
|
||||
-1, -1,
|
||||
45,
|
||||
22,
|
||||
-1, -1, -1,
|
||||
6,
|
||||
61,
|
||||
-1, -1,
|
||||
71,
|
||||
52,
|
||||
3,
|
||||
-1, -1, -1,
|
||||
18,
|
||||
71,
|
||||
-1, -1, -1,
|
||||
21,
|
||||
-1,
|
||||
37,
|
||||
-1,
|
||||
4,
|
||||
60,
|
||||
25,
|
||||
-1, -1,
|
||||
72,
|
||||
51,
|
||||
-1,
|
||||
44,
|
||||
29,
|
||||
-1,
|
||||
28,
|
||||
0,
|
||||
-1,
|
||||
14,
|
||||
31,
|
||||
63,
|
||||
12,
|
||||
-1,
|
||||
13,
|
||||
33,
|
||||
-1, -1,
|
||||
68,
|
||||
-1, -1, -1, -1,
|
||||
-1, -1,
|
||||
20,
|
||||
-1, -1, -1, -1,
|
||||
-1, -1, -1,
|
||||
77,
|
||||
-1, -1, -1, -1,
|
||||
-1, -1, -1, -1,
|
||||
65,
|
||||
-1, -1, -1, -1,
|
||||
70,
|
||||
-1, -1, -1, -1,
|
||||
-1,
|
||||
41,
|
||||
-1, -1, -1, -1,
|
||||
-1,
|
||||
17,
|
||||
-1, -1, -1,
|
||||
42,
|
||||
16,
|
||||
-1, -1, -1, -1,
|
||||
-1, -1, -1, -1,
|
||||
-1, -1, -1, -1,
|
||||
15,
|
||||
-1, -1, -1, -1,
|
||||
34,
|
||||
-1, -1, -1, -1,
|
||||
-1, -1, -1,
|
||||
32,
|
||||
50,
|
||||
34,
|
||||
-1, -1, -1,
|
||||
62,
|
||||
-1, -1, -1, -1, -1, -1, -1, -1, -1,
|
||||
59,
|
||||
0,
|
||||
-1, -1, -1, -1,
|
||||
22,
|
||||
-1, -1, -1, -1, -1, -1, -1, -1, -1,
|
||||
25,
|
||||
41,
|
||||
-1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
|
||||
19,
|
||||
-1, -1, -1,
|
||||
4,
|
||||
-1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
|
||||
69,
|
||||
-1, -1, -1, -1,
|
||||
64,
|
||||
-1, -1, -1, -1,
|
||||
-1, -1, -1, -1,
|
||||
-1, -1, -1, -1,
|
||||
-1, -1, -1, -1,
|
||||
-1, -1, -1, -1,
|
||||
-1, -1, -1, -1,
|
||||
-1, -1, -1, -1,
|
||||
-1, -1, -1, -1,
|
||||
-1, -1, -1, -1,
|
||||
-1,
|
||||
19
|
||||
};
|
||||
|
||||
static unsigned int mbfl_name2encoding_perfect_hash(const char *str, size_t len)
|
||||
{
|
||||
static const unsigned char asso_values[] =
|
||||
{
|
||||
187, 187, 187, 187, 187, 187, 187, 187, 187, 187,
|
||||
187, 187, 187, 187, 187, 187, 187, 187, 187, 187,
|
||||
187, 187, 187, 187, 187, 187, 187, 187, 187, 187,
|
||||
187, 187, 187, 187, 187, 187, 187, 187, 187, 187,
|
||||
187, 187, 187, 187, 187, 0, 187, 187, 5, 20,
|
||||
0, 15, 40, 10, 25, 70, 5, 60, 187, 187,
|
||||
187, 187, 187, 187, 187, 75, 5, 0, 20, 5,
|
||||
0, 75, 5, 0, 40, 75, 20, 0, 0, 0,
|
||||
35, 45, 50, 0, 75, 0, 187, 0, 187, 187,
|
||||
0, 187, 187, 187, 187, 187, 187, 75, 5, 0,
|
||||
20, 5, 0, 75, 5, 0, 40, 75, 20, 0,
|
||||
0, 0, 35, 45, 50, 0, 75, 0, 187, 0,
|
||||
187, 187, 0, 187, 187, 187, 187, 187, 187, 187,
|
||||
187, 187, 187, 187, 187, 187, 187, 187, 187, 187,
|
||||
187, 187, 187, 187, 187, 187, 187, 187, 187, 187,
|
||||
187, 187, 187, 187, 187, 187, 187, 187, 187, 187,
|
||||
187, 187, 187, 187, 187, 187, 187, 187, 187, 187,
|
||||
187, 187, 187, 187, 187, 187, 187, 187, 187, 187,
|
||||
187, 187, 187, 187, 187, 187, 187, 187, 187, 187,
|
||||
187, 187, 187, 187, 187, 187, 187, 187, 187, 187,
|
||||
187, 187, 187, 187, 187, 187, 187, 187, 187, 187,
|
||||
187, 187, 187, 187, 187, 187, 187, 187, 187, 187,
|
||||
187, 187, 187, 187, 187, 187, 187, 187, 187, 187,
|
||||
187, 187, 187, 187, 187, 187, 187, 187, 187, 187,
|
||||
187, 187, 187, 187, 187, 187, 187, 187, 187, 187,
|
||||
187, 187, 187, 187, 187, 187
|
||||
231, 231, 231, 231, 231, 231, 231, 231, 231, 231,
|
||||
231, 231, 231, 231, 231, 231, 231, 231, 231, 231,
|
||||
231, 231, 231, 231, 231, 231, 231, 231, 231, 231,
|
||||
231, 231, 231, 231, 231, 231, 231, 231, 231, 231,
|
||||
231, 231, 231, 231, 231, 5, 231, 231, 0, 50,
|
||||
5, 15, 35, 10, 20, 75, 0, 45, 231, 231,
|
||||
231, 231, 231, 231, 231, 80, 5, 0, 0, 0,
|
||||
75, 75, 0, 0, 15, 70, 0, 5, 0, 0,
|
||||
25, 55, 30, 0, 10, 0, 231, 25, 231, 231,
|
||||
0, 231, 231, 231, 231, 231, 231, 80, 5, 0,
|
||||
0, 0, 75, 75, 0, 0, 15, 70, 0, 5,
|
||||
0, 0, 25, 55, 30, 0, 10, 0, 231, 25,
|
||||
231, 231, 0, 231, 231, 231, 231, 231, 231, 231,
|
||||
231, 231, 231, 231, 231, 231, 231, 231, 231, 231,
|
||||
231, 231, 231, 231, 231, 231, 231, 231, 231, 231,
|
||||
231, 231, 231, 231, 231, 231, 231, 231, 231, 231,
|
||||
231, 231, 231, 231, 231, 231, 231, 231, 231, 231,
|
||||
231, 231, 231, 231, 231, 231, 231, 231, 231, 231,
|
||||
231, 231, 231, 231, 231, 231, 231, 231, 231, 231,
|
||||
231, 231, 231, 231, 231, 231, 231, 231, 231, 231,
|
||||
231, 231, 231, 231, 231, 231, 231, 231, 231, 231,
|
||||
231, 231, 231, 231, 231, 231, 231, 231, 231, 231,
|
||||
231, 231, 231, 231, 231, 231, 231, 231, 231, 231,
|
||||
231, 231, 231, 231, 231, 231, 231, 231, 231, 231,
|
||||
231, 231, 231, 231, 231, 231, 231, 231, 231, 231,
|
||||
231, 231, 231, 231, 231, 231
|
||||
};
|
||||
unsigned int hval = len;
|
||||
|
||||
@@ -345,7 +377,7 @@ const mbfl_encoding *mbfl_name2encoding_ex(const char *name, size_t name_len)
|
||||
/* Use perfect hash lookup for name */
|
||||
if (name_len <= NAME_HASH_MAX_NAME_LENGTH && name_len >= NAME_HASH_MIN_NAME_LENGTH) {
|
||||
unsigned int key = mbfl_name2encoding_perfect_hash(name, name_len);
|
||||
if (key <= 186) {
|
||||
/* `key` is used directly as an index into the hash table below, so it must be
 * strictly less than the element count. The table elements are int8_t, so
 * sizeof() of the array equals its number of entries. */
if (key < sizeof(mbfl_encoding_ptr_list_after_hashing)) {
|
||||
int8_t offset = mbfl_encoding_ptr_list_after_hashing[key];
|
||||
if (offset >= 0) {
|
||||
encoding = mbfl_encoding_ptr_list + offset;
|
||||
|
||||
@@ -84,6 +84,7 @@ enum mbfl_no_encoding {
|
||||
mbfl_no_encoding_2022jp_kddi,
|
||||
mbfl_no_encoding_2022jpms,
|
||||
mbfl_no_encoding_gb18030,
|
||||
mbfl_no_encoding_gb18030_2022,
|
||||
mbfl_no_encoding_cp1252,
|
||||
mbfl_no_encoding_cp1254,
|
||||
mbfl_no_encoding_8859_1,
|
||||
|
||||
63488
ext/mbstring/tests/data/GB18030-2022MappingTableBMP.txt
Normal file
63488
ext/mbstring/tests/data/GB18030-2022MappingTableBMP.txt
Normal file
File diff suppressed because it is too large
Load Diff
310
ext/mbstring/tests/gb18030_2022_encoding.phpt
Normal file
310
ext/mbstring/tests/gb18030_2022_encoding.phpt
Normal file
@@ -0,0 +1,310 @@
|
||||
--TEST--
|
||||
Exhaustive test of verification and conversion of GB18030-2022 text
|
||||
--EXTENSIONS--
|
||||
mbstring
|
||||
--SKIPIF--
|
||||
<?php
|
||||
if (getenv("SKIP_SLOW_TESTS")) die("skip slow test");
|
||||
?>
|
||||
--FILE--
|
||||
<?php
|
||||
include('encoding_tests.inc');
|
||||
srand(2323); // Make results consistent
|
||||
mb_substitute_character(0x25); // '%'
|
||||
|
||||
$updatedMappings = [
|
||||
"\xA6\xD9" => "\xFE\x10",
|
||||
"\xA6\xDA" => "\xFE\x12",
|
||||
"\xA6\xDB" => "\xFE\x11",
|
||||
"\xA6\xDC" => "\xFE\x13",
|
||||
"\xA6\xDD" => "\xFE\x14",
|
||||
"\xA6\xDE" => "\xFE\x15",
|
||||
"\xA6\xDF" => "\xFE\x16",
|
||||
"\xA6\xEC" => "\xFE\x17",
|
||||
"\xA6\xED" => "\xFE\x18",
|
||||
"\xA6\xF3" => "\xFE\x19",
|
||||
|
||||
"\xA8\xBC" => "\x1E\x3F",
|
||||
"\xA8\xBF" => "\x01\xF9",
|
||||
"\xA9\x89" => "\x30\x3E",
|
||||
"\xA9\x8A" => "\x2F\xF0",
|
||||
"\xA9\x8B" => "\x2F\xF1",
|
||||
"\xA9\x8C" => "\x2F\xF2",
|
||||
"\xA9\x8D" => "\x2F\xF3",
|
||||
"\xA9\x8E" => "\x2F\xF4",
|
||||
"\xA9\x8F" => "\x2F\xF5",
|
||||
"\xA9\x90" => "\x2F\xF6",
|
||||
"\xA9\x91" => "\x2F\xF7",
|
||||
"\xA9\x92" => "\x2F\xF8",
|
||||
"\xA9\x93" => "\x2F\xF9",
|
||||
"\xA9\x94" => "\x2F\xFA",
|
||||
"\xA9\x95" => "\x2F\xFB",
|
||||
|
||||
"\xFE\x50" => "\x2E\x81",
|
||||
"\xFE\x51" => "\xE8\x16",
|
||||
"\xFE\x52" => "\xE8\x17",
|
||||
"\xFE\x53" => "\xE8\x18",
|
||||
"\xFE\x54" => "\x2E\x84",
|
||||
"\xFE\x55" => "\x34\x73",
|
||||
"\xFE\x56" => "\x34\x47",
|
||||
"\xFE\x57" => "\x2E\x88",
|
||||
"\xFE\x58" => "\x2E\x8B",
|
||||
"\xFE\x59" => "\x9F\xB4",
|
||||
"\xFE\x5A" => "\x35\x9E",
|
||||
"\xFE\x5B" => "\x36\x1A",
|
||||
"\xFE\x5C" => "\x36\x0E",
|
||||
"\xFE\x5D" => "\x2E\x8C",
|
||||
"\xFE\x5E" => "\x2E\x97",
|
||||
"\xFE\x5F" => "\x39\x6E",
|
||||
|
||||
"\xFE\x60" => "\x39\x18",
|
||||
"\xFE\x61" => "\x9F\xB5",
|
||||
"\xFE\x62" => "\x39\xCF",
|
||||
"\xFE\x63" => "\x39\xDF",
|
||||
"\xFE\x64" => "\x3A\x73",
|
||||
"\xFE\x65" => "\x39\xD0",
|
||||
"\xFE\x66" => "\x9F\xB6",
|
||||
"\xFE\x67" => "\x9F\xB7",
|
||||
"\xFE\x68" => "\x3B\x4E",
|
||||
"\xFE\x69" => "\x3C\x6E",
|
||||
"\xFE\x6A" => "\x3C\xE0",
|
||||
"\xFE\x6B" => "\x2E\xA7",
|
||||
"\xFE\x6C" => "\xE8\x31",
|
||||
"\xFE\x6D" => "\x9F\xB8",
|
||||
"\xFE\x6E" => "\x2E\xAA",
|
||||
"\xFE\x6F" => "\x40\x56",
|
||||
|
||||
"\xFE\x76" => "\xE8\x3B",
|
||||
"\xFE\x7E" => "\x9F\xB9",
|
||||
"\xFE\x90" => "\x9F\xBA",
|
||||
"\xFE\x91" => "\xE8\x55",
|
||||
"\xFE\xA0" => "\x9F\xBB"];
|
||||
testAllValidChars($updatedMappings, 'GB18030-2022', 'UTF-16BE', false);
|
||||
testAllValidChars(array_flip($updatedMappings), 'UTF-16BE', 'GB18030-2022', false);
|
||||
|
||||
$sampleSMP = [
|
||||
"\x00\x10\x03\x08" => "\xDE\x30\xE6\x36",
|
||||
"\x00\x10\x14\xEB" => "\xDE\x34\xB8\x35",
|
||||
"\x00\x10\x29\x76" => "\xDE\x38\xCE\x34",
|
||||
"\x00\x10\x40\x6E" => "\xDF\x33\xA4\x34",
|
||||
"\x00\x10\x78\x7B" => "\xE0\x34\xD5\x33",
|
||||
"\x00\x01\x25\x2A" => "\x90\x37\xC6\x34",
|
||||
"\x00\x01\x5B\xA4" => "\x91\x38\xCF\x30",
|
||||
"\x00\x01\x6D\x81" => "\x92\x32\xA0\x33",
|
||||
"\x00\x01\x7F\xB2" => "\x92\x35\xF8\x30",
|
||||
"\x00\x01\x89\x9B" => "\x92\x37\xF9\x37",
|
||||
"\x00\x01\x9E\x77" => "\x93\x32\x99\x37",
|
||||
"\x00\x02\x08\x9A" => "\x95\x33\xE0\x38",
|
||||
"\x00\x02\x1B\x00" => "\x95\x37\xBF\x38",
|
||||
"\x00\x02\x31\xBE" => "\x96\x32\x90\x30",
|
||||
"\x00\x02\x64\xD4" => "\x97\x32\xBF\x38",
|
||||
"\x00\x02\xA9\xA0" => "\x98\x36\xBD\x30",
|
||||
"\x00\x02\xBA\x38" => "\x98\x39\xEB\x38",
|
||||
"\x00\x03\x1C\x13" => "\x9A\x39\xDC\x39",
|
||||
"\x00\x03\x20\x6D" => "\x9B\x30\xCE\x33",
|
||||
"\x00\x03\x22\xA9" => "\x9B\x31\x89\x35",
|
||||
"\x00\x03\x39\xB3" => "\x9B\x35\xDF\x33",
|
||||
"\x00\x03\xA7\xF2" => "\x9D\x38\x93\x36",
|
||||
"\x00\x03\xDF\xFB" => "\x9E\x39\xC4\x31",
|
||||
"\x00\x04\x01\x69" => "\x9F\x36\xA9\x39",
|
||||
"\x00\x04\x23\x79" => "\xA0\x33\x9F\x39",
|
||||
"\x00\x04\x26\x52" => "\xA0\x33\xE8\x38",
|
||||
"\x00\x04\x38\xDB" => "\xA0\x37\xCB\x33",
|
||||
"\x00\x04\x46\x84" => "\xA1\x30\xAF\x30",
|
||||
"\x00\x04\x6C\x7C" => "\xA1\x38\x8B\x30",
|
||||
"\x00\x04\x78\x41" => "\xA2\x30\xBC\x33",
|
||||
"\x00\x04\x97\x32" => "\xA2\x36\xE0\x34",
|
||||
"\x00\x04\x9E\xCC" => "\xA2\x38\xA7\x30",
|
||||
"\x00\x04\xC5\xDB" => "\xA3\x36\x9E\x39",
|
||||
"\x00\x04\xF4\xE2" => "\xA4\x35\xE4\x38",
|
||||
"\x00\x05\x3B\xA6" => "\xA6\x30\x96\x34",
|
||||
"\x00\x05\x76\x53" => "\xA7\x32\x8C\x35",
|
||||
"\x00\x05\xEA\x9F" => "\xA9\x35\xDB\x37",
|
||||
"\x00\x06\x12\x29" => "\xAA\x33\xDF\x39",
|
||||
"\x00\x06\x1B\x9E" => "\xAA\x35\xD6\x30",
|
||||
"\x00\x06\x3B\x26" => "\xAB\x32\x8B\x32",
|
||||
"\x00\x06\x4C\xA8" => "\xAB\x35\xD1\x34",
|
||||
"\x00\x06\x63\x3E" => "\xAC\x30\x9D\x36",
|
||||
"\x00\x06\xB3\xA1" => "\xAD\x36\xC7\x35",
|
||||
"\x00\x07\x0A\x31" => "\xAF\x34\x93\x35",
|
||||
"\x00\x07\x22\xA7" => "\xAF\x39\x8F\x37",
|
||||
"\x00\x07\x79\xA3" => "\xB1\x36\xE4\x35",
|
||||
"\x00\x07\x88\xFA" => "\xB1\x39\xF3\x32",
|
||||
"\x00\x07\xCE\xCA" => "\xB3\x34\x8C\x34",
|
||||
"\x00\x07\xF8\xD2" => "\xB4\x32\xD0\x34",
|
||||
"\x00\x08\x20\xF6" => "\xB5\x30\xE4\x30",
|
||||
"\x00\x08\xAD\x05" => "\xB7\x39\x9F\x35",
|
||||
"\x00\x08\xEA\x7E" => "\xB9\x31\xDD\x32",
|
||||
"\x00\x08\xF0\xB8" => "\xB9\x32\xFE\x36",
|
||||
"\x00\x09\x14\x07" => "\xBA\x30\x96\x35",
|
||||
"\x00\x09\x41\xDD" => "\xBA\x39\xBD\x39",
|
||||
"\x00\x09\x42\xEF" => "\xBA\x39\xD9\x33",
|
||||
"\x00\x07\x22\xA7" => "\xAF\x39\x8F\x37",
|
||||
"\x00\x07\x79\xA3" => "\xB1\x36\xE4\x35",
|
||||
"\x00\x07\x88\xFA" => "\xB1\x39\xF3\x32",
|
||||
"\x00\x07\xCE\xCA" => "\xB3\x34\x8C\x34",
|
||||
"\x00\x07\xF8\xD2" => "\xB4\x32\xD0\x34",
|
||||
"\x00\x08\x20\xF6" => "\xB5\x30\xE4\x30",
|
||||
"\x00\x08\xAD\x05" => "\xB7\x39\x9F\x35",
|
||||
"\x00\x08\xEA\x7E" => "\xB9\x31\xDD\x32",
|
||||
"\x00\x08\xF0\xB8" => "\xB9\x32\xFE\x36",
|
||||
"\x00\x09\x14\x07" => "\xBA\x30\x96\x35",
|
||||
"\x00\x09\x41\xDD" => "\xBA\x39\xBD\x39",
|
||||
"\x00\x09\x42\xEF" => "\xBA\x39\xD9\x33",
|
||||
"\x00\x09\xBA\x2B" => "\xBD\x33\xF5\x37",
|
||||
"\x00\x0A\x26\x00" => "\xBF\x35\xEA\x32",
|
||||
"\x00\x0A\x36\xE9" => "\xBF\x39\xA3\x31",
|
||||
"\x00\x0A\x7A\x20" => "\xC1\x32\xF5\x38",
|
||||
"\x00\x0A\x9C\x93" => "\xC1\x39\xF5\x37",
|
||||
"\x00\x0A\xC0\xD7" => "\xC2\x37\xA6\x31",
|
||||
"\x00\x0A\xD8\x77" => "\xC3\x32\x8C\x39",
|
||||
"\x00\x0B\x1A\x9B" => "\xC4\x35\xC4\x31",
|
||||
"\x00\x0B\x4F\x27" => "\xC5\x36\x9B\x33",
|
||||
"\x00\x0B\x72\x6D" => "\xC6\x33\xB0\x33",
|
||||
"\x00\x0B\xEE\x23" => "\xC8\x38\xC1\x33",
|
||||
"\x00\x0B\xF0\xDF" => "\xC8\x39\x89\x33",
|
||||
"\x00\x0C\x0B\xE1" => "\xC9\x34\xC6\x37",
|
||||
"\x00\x0C\x4C\x98" => "\xCA\x37\xD9\x34",
|
||||
"\x00\x0C\x5F\x41" => "\xCB\x31\xBF\x31",
|
||||
"\x00\x0C\x63\xE4" => "\xCB\x32\xB7\x38",
|
||||
"\x00\x0C\x70\x0A" => "\xCB\x34\xF2\x38",
|
||||
"\x00\x0C\xAD\x6A" => "\xCC\x37\xB0\x30",
|
||||
"\x00\x0C\xCC\x03" => "\xCD\x33\xCB\x33",
|
||||
"\x00\x0C\xD5\x4C" => "\xCD\x35\xBD\x30",
|
||||
"\x00\x0C\xE6\x70" => "\xCD\x38\xF9\x38",
|
||||
"\x00\x0D\x1B\x6A" => "\xCE\x39\xDC\x30",
|
||||
"\x00\x0D\x55\xEE" => "\xD0\x31\xCE\x30",
|
||||
"\x00\x0D\xBB\xB1" => "\xD2\x32\xA5\x31",
|
||||
"\x00\x0D\xC0\x4F" => "\xD2\x33\x9D\x33",
|
||||
"\x00\x0D\xFA\x84" => "\xD3\x35\x87\x34",
|
||||
"\x00\x0E\x16\x71" => "\xD4\x30\xDC\x33",
|
||||
"\x00\x0E\x1E\x03" => "\xD4\x32\xA2\x31",
|
||||
"\x00\x0E\x20\xE8" => "\xD4\x32\xEC\x32",
|
||||
"\x00\x0E\x39\x6A" => "\xD4\x37\xE9\x36",
|
||||
"\x00\x0E\x6A\x95" => "\xD5\x37\xE8\x33",
|
||||
"\x00\x0E\x7E\xCD" => "\xD6\x31\xF5\x39",
|
||||
"\x00\x0E\x80\x69" => "\xD6\x32\xA1\x31",
|
||||
"\x00\x0E\x9A\x7F" => "\xD6\x37\xC6\x39",
|
||||
"\x00\x0E\xEE\x12" => "\xD8\x34\xC4\x34",
|
||||
"\x00\x0E\xFC\xA1" => "\xD8\x37\xBF\x31",
|
||||
"\x00\x0F\x29\xB0" => "\xD9\x36\xD2\x36",
|
||||
"\x00\x0F\x2A\x12" => "\xD9\x36\xDC\x34",
|
||||
"\x00\x0F\x6C\x8C" => "\xDB\x30\x9E\x32",
|
||||
"\x00\x0F\xAF\x04" => "\xDC\x33\xDD\x38",
|
||||
"\x00\x0F\xBE\x65" => "\xDC\x36\xED\x35",
|
||||
"\x00\x0F\xE5\x88" => "\xDD\x34\xE7\x34",
|
||||
"\x00\x0F\xE7\xB1" => "\xDD\x35\xA0\x37",
|
||||
"\x00\x0F\xF4\x27" => "\xDD\x37\xE3\x37"];
|
||||
testAllValidChars($sampleSMP, 'UTF-32BE', 'GB18030-2022', false);
|
||||
|
||||
/**
 * Load a GB18030-2022 mapping table from a text file.
 *
 * Lines whose first character is '#' are skipped. Every other line is scanned
 * with the format "%x\t%x"; the first hex field is used as the Unicode
 * codepoint and the second as the GB18030 code written as a single hex
 * number. Lines which do not yield both fields are ignored.
 *
 * @param string $path  Path of the mapping file to read
 * @param array  $from  Output: GB18030 byte string => packed codepoint
 * @param array  $to    Output: packed codepoint => GB18030 byte string
 * @param bool   $utf32 Pack codepoints as 32-bit big-endian ('N') instead of
 *                      16-bit big-endian ('n')
 */
function readGB18030_2022_ConversionTable($path, &$from, &$to, $utf32 = false) {
  $from = [];
  $to   = [];

  // The table is only ever read; 'r+' would needlessly require write access
  $fp = fopen($path, 'r');
  if ($fp === false) {
    throw new RuntimeException("Cannot open conversion table: $path");
  }

  // Compare with false explicitly, so that a line which is falsy as a string
  // (such as a final "0" with no newline) does not terminate the loop early
  while (($line = fgets($fp, 256)) !== false) {
    if ($line[0] == '#')
      continue;
    if (sscanf($line, "%x\t%x", $codepoint, $char) == 2) {
      $codepoint = $utf32 ? pack('N', $codepoint) : pack('n', $codepoint);
      if ($char == PHP_INT_MAX) {
        // We may be on a 32-bit machine and testing a text encoding with 4-byte codes
        // (which can't be represented in a PHP integer)
        // NOTE(review): the loop below collects hex digit pairs starting at
        // offset 2 of the line, i.e. from the first column; confirm that this
        // matches the column order of the data file
        $char = "";
        for ($i = 2; $i < strlen($line); $i += 2) {
          $substr = substr($line, $i, 2);
          if (ctype_xdigit($substr))
            $char .= chr(hexdec($substr));
          else
            break;
        }
      } else {
        if ($char <= 0xFF)
          $char = chr($char); // hex codes must not have leading zero bytes
        else if ($char <= 0xFFFF)
          $char = pack('n', $char);
        else if ($char <= 0xFFFFFF)
          $char = chr($char >> 16) . pack('n', $char & 0xFFFF);
        else
          $char = pack('N', $char);
      }
      $from[$char] = $codepoint;
      $to[$codepoint] = $char;
    }
  }

  // Release the file handle instead of leaving it open until script shutdown
  fclose($fp);
}
|
||||
|
||||
readGB18030_2022_ConversionTable(__DIR__ . '/data/GB18030-2022MappingTableBMP.txt', $toUnicode, $fromUnicode);
|
||||
|
||||
// We will test 4-byte codes separately
|
||||
findInvalidChars($toUnicode, $invalid, $truncated);
|
||||
|
||||
/**
 * Filter predicate: true if $gb cannot be (the start of) a 4-byte code.
 *
 * A string is rejected as a 4-byte code when its first byte is outside both
 * 0x81-0x84 and 0x90-0xE3, or when it has a second byte which is not in
 * 0x30-0x39.
 */
function notFourByteCode($gb) {
  $lead = ord($gb);
  $inLowRange  = ($lead >= 0x81 && $lead <= 0x84);
  $inHighRange = ($lead >= 0x90 && $lead <= 0xE3);

  if (!$inLowRange && !$inHighRange) {
    return true;
  }
  if (strlen($gb) <= 1) {
    return false;
  }

  $second = ord($gb[1]);
  return $second < 0x30 || $second > 0x39;
}
|
||||
|
||||
$invalid = array_filter($invalid, 'notFourByteCode', ARRAY_FILTER_USE_KEY);
|
||||
$truncated = array_filter($truncated, 'notFourByteCode', ARRAY_FILTER_USE_KEY);
|
||||
|
||||
testAllValidChars($toUnicode, 'GB18030-2022', 'UTF-16BE', false);
|
||||
testAllInvalidChars($invalid, $toUnicode, 'GB18030-2022', 'UTF-16BE', "\x00%");
|
||||
testTruncatedChars($truncated, 'GB18030-2022', 'UTF-16BE', "\x00%");
|
||||
|
||||
echo "Tested GB18030-2022 (BMP) -> UTF-16BE\n";
|
||||
|
||||
// Test one random 4-byte code for each range used for Unicode codepoints in BMP
|
||||
/**
 * Convert the bytes of a 4-byte code to a linear index.
 *
 * $byte4 is the most significant position and $byte1 the least. $byte4 and
 * $byte2 are counted from 0x81 (126 values for $byte2), while $byte3 and
 * $byte1 are counted from 0x30 (10 values each).
 */
function fourByteCodeIndex($byte4, $byte3, $byte2, $byte1) {
  // Horner-style evaluation of the mixed-radix number (x, 10, 126, 10)
  $index = $byte4 - 0x81;
  $index = $index * 10 + ($byte3 - 0x30);
  $index = $index * 126 + ($byte2 - 0x81);
  $index = $index * 10 + ($byte1 - 0x30);
  return $index;
}
|
||||
|
||||
/**
 * Inverse of fourByteCodeIndex(): build the 4-byte string for a linear index.
 *
 * The index is split into mixed-radix digits (radices 10, 126, 10 from the
 * second byte onwards); the first and third bytes are offset by 0x81, the
 * second and fourth by 0x30.
 */
function fourByteCodeFromIndex($index) {
  $perLead   = 10 * 126 * 10; // number of codes sharing one first byte
  $perSecond = 10 * 126;      // number of codes sharing the first two bytes

  $lead   = intdiv($index, $perLead);
  $index  = $index % $perLead;
  $second = intdiv($index, $perSecond);
  $index  = $index % $perSecond;
  $third  = intdiv($index, 10);
  $fourth = $index % 10;

  $bytes = [$lead + 0x81, $second + 0x30, $third + 0x81, $fourth + 0x30];
  return implode('', array_map('chr', $bytes));
}
|
||||
|
||||
// Invalid 4-byte codes in range for BMP
|
||||
testInvalidString("\x81\x30\x81\xFF", "\x00\x00\x00%", "GB18030-2022", "UTF-32BE");
|
||||
testInvalidString("\x84\x31\xA4\x40", "\x00\x00\x00%", "GB18030-2022", "UTF-32BE");
|
||||
testInvalidString("\x84\x31\xA5\x30", "\x00\x00\x00%", "GB18030-2022", "UTF-32BE");
|
||||
testInvalidString("\x84\x32\x81\x30", "\x00\x00\x00%", "GB18030-2022", "UTF-32BE");
|
||||
testInvalidString("\x85\x31\x81\x30", "\x00\x00\x00%\x00\x00\x00%", "GB18030-2022", "UTF-32BE");
|
||||
|
||||
// Valid 4-byte codes for other Unicode planes
|
||||
testValidString("\x90\x30\x81\x30", "\x00\x01\x00\x00", "GB18030-2022", "UTF-32BE");
|
||||
testValidString("\xE3\x32\x9A\x35", "\x00\x10\xFF\xFF", "GB18030-2022", "UTF-32BE");
|
||||
|
||||
// Invalid 4-byte codes for other Unicode planes
|
||||
testInvalidString("\x90\x30\x81\xFF", "\x00\x00\x00%", "GB18030-2022", "UTF-32BE");
|
||||
testInvalidString("\xE3\x32\x9A\x36", "\x00\x00\x00%", "GB18030-2022", "UTF-32BE");
|
||||
testInvalidString("\xE4\x30\x81\x35", "\x00\x00\x00%\x00\x00\x00%", "GB18030-2022", "UTF-32BE");
|
||||
|
||||
testInvalidString("\x90\x30\x80\x30", "\x00\x00\x00%\x00\x00\x00\x30", "GB18030-2022", "UTF-32BE");
|
||||
|
||||
echo "Tested GB18030-2022 (SMP) <-> UTF-32BE\n";
|
||||
|
||||
testAllValidChars($fromUnicode, 'UTF-16BE', 'GB18030-2022', false);
|
||||
echo "Tested UTF-16BE -> GB18030-2022 (BMP)\n";
|
||||
|
||||
convertInvalidString("\xAA\xB8\x2D\x38\x00\x00\x00#", "%#", "UTF-32BE", "GB18030-2022");
|
||||
|
||||
// Test "long" illegal character markers
|
||||
mb_substitute_character("long");
|
||||
convertInvalidString("\x81\x30\x81\xFF", "%", "GB18030-2022", "UTF-8");
|
||||
convertInvalidString("\xE3\x32\x9A\x36", "%", "GB18030-2022", "UTF-8");
|
||||
|
||||
echo "Done!\n";
|
||||
?>
|
||||
--EXPECT--
|
||||
Tested GB18030-2022 (BMP) -> UTF-16BE
|
||||
Tested GB18030-2022 (SMP) <-> UTF-32BE
|
||||
Tested UTF-16BE -> GB18030-2022 (BMP)
|
||||
Done!
|
||||
Reference in New Issue
Block a user