1
0
mirror of https://github.com/php/php-src.git synced 2026-04-02 05:32:28 +02:00

Fix mbstring support for CP1254 encoding

One funny thing: while the original author used Unicode 0xFFFD (the generic
replacement character) for invalid bytes in CP1251 and CP1252, for CP1254
they used 0xFFFE. That value is a Unicode noncharacter, not a usable
character: it is what a byte-order mark (0xFEFF) looks like when it is read
with the wrong byte order. This was probably a mistake.

In any case, this commit:

- Fixes the identify filter, which was completely wrong.
- Stops converting Unicode 0xFFFE to a random (but valid) CP1254 byte.
- Stops passing invalid bytes through silently when converting CP1254 to CP1254.
This commit is contained in:
Alex Dowad
2020-10-18 15:30:03 +02:00
parent eb4151e89e
commit e6d17cfe44
3 changed files with 19 additions and 55 deletions

View File

@@ -74,79 +74,44 @@ const struct mbfl_convert_vtbl vtbl_wchar_cp1254 = {
#define CK(statement) do { if ((statement) < 0) return (-1); } while (0)
/*
* wchar => cp1254
*/
int
mbfl_filt_conv_wchar_cp1254(int c, mbfl_convert_filter *filter)
int mbfl_filt_conv_wchar_cp1254(int c, mbfl_convert_filter *filter)
{
int s, n;
if (c < 0x80) {
s = c;
CK((*filter->output_function)(c, filter->data));
} else {
s = -1;
n = cp1254_ucs_table_len-1;
while (n >= 0) {
if (c == cp1254_ucs_table[n] && c != 0xfffe) {
s = cp1254_ucs_table_min + n;
break;
for (int n = 0; n < cp1254_ucs_table_len; n++) {
if (c == cp1254_ucs_table[n]) {
CK((*filter->output_function)(cp1254_ucs_table_min + n, filter->data));
return c;
}
n--;
}
if (s <= 0 && (c & ~MBFL_WCSPLANE_MASK) == MBFL_WCSPLANE_CP1254) {
s = c & MBFL_WCSPLANE_MASK;
}
}
if (s >= 0) {
CK((*filter->output_function)(s, filter->data));
} else {
CK(mbfl_filt_conv_illegal_output(c, filter));
}
return c;
}
/*
* cp1254 => wchar
*/
int
mbfl_filt_conv_cp1254_wchar(int c, mbfl_convert_filter *filter)
int mbfl_filt_conv_cp1254_wchar(int c, mbfl_convert_filter *filter)
{
int s;
if (c >= 0 && c < cp1254_ucs_table_min) {
if (c < cp1254_ucs_table_min) {
s = c;
} else if (c >= cp1254_ucs_table_min && c < 0x100) {
s = cp1254_ucs_table[c - cp1254_ucs_table_min];
if (s <= 0) {
s = c;
s &= MBFL_WCSPLANE_MASK;
s |= MBFL_WCSPLANE_CP1254;
}
} else {
s = c;
s &= MBFL_WCSGROUP_MASK;
s |= MBFL_WCSGROUP_THROUGH;
s = cp1254_ucs_table[c - cp1254_ucs_table_min];
if (!s) {
s = c | MBFL_WCSGROUP_THROUGH;
}
}
CK((*filter->output_function)(s, filter->data));
return c;
}
/* We only distinguish the MS extensions to ISO-8859-1.
* Actually, this is pretty much a NO-OP, since the identification
* system doesn't allow us to discriminate between a positive match,
* a possible match and a definite non-match.
* The problem here is that cp1254 looks like SJIS for certain chars.
* */
static int mbfl_filt_ident_cp1254(int c, mbfl_identify_filter *filter)
{
if (c >= 0x80 && c < 0xff)
filter->flag = 0;
else
filter->flag = 1; /* not it */
if (c >= 0x81 && c <= 0x9E && !cp1254_ucs_table[c - cp1254_ucs_table_min]) {
filter->flag = 1;
}
return c;
}

View File

@@ -27,10 +27,10 @@
/* cp1254 to Unicode table */
static const unsigned short cp1254_ucs_table[] = {
0x20ac, 0xfffe, 0x201a, 0x0192, 0x201e, 0x2026, 0x2020, 0x2021,
0x02c6, 0x2030, 0x0160, 0x2039, 0x0152, 0xfffe, 0xfffe, 0xfffe,
0xfffe, 0x2018, 0x2019, 0x201c, 0x201d, 0x2022, 0x2013, 0x2014,
0x02dc, 0x2122, 0x0161, 0x203a, 0x0153, 0xfffe, 0xfffe, 0x0178,
0x20ac, 0x0000, 0x201a, 0x0192, 0x201e, 0x2026, 0x2020, 0x2021,
0x02c6, 0x2030, 0x0160, 0x2039, 0x0152, 0x0000, 0x0000, 0x0000,
0x0000, 0x2018, 0x2019, 0x201c, 0x201d, 0x2022, 0x2013, 0x2014,
0x02dc, 0x2122, 0x0161, 0x203a, 0x0153, 0x0000, 0x0000, 0x0178,
0x00a0, 0x00a1, 0x00a2, 0x00a3, 0x00a4, 0x00a5, 0x00a6, 0x00a7,
0x00a8, 0x00a9, 0x00aa, 0x00ab, 0x00ac, 0x00ad, 0x00ae, 0x00af,
0x00b0, 0x00b1, 0x00b2, 0x00b3, 0x00b4, 0x00b5, 0x00b6, 0x00b7,

View File

@@ -75,7 +75,6 @@
#define MBFL_WCSPLANE_8859_16 0x70fa0000 /* 00h - FFh */
#define MBFL_WCSPLANE_ARMSCII8 0x70fb0000
#define MBFL_WCSPLANE_KOI8U 0x70fc0000
#define MBFL_WCSPLANE_CP1254 0x70fd0000 /* 00h - FFh */
#define MBFL_WCSPLANE_CP850 0x70fe0000 /* 00h - FFh */
#define MBFL_WCSPLANE_GB18030 0x70ff0000 /* a1a1h-e3329a35h */
#define MBFL_WCSGROUP_MASK 0xffffff