From e6d17cfe44f4fcecf201dd0bdfad1d7705fc919f Mon Sep 17 00:00:00 2001 From: Alex Dowad Date: Sun, 18 Oct 2020 15:30:03 +0200 Subject: [PATCH] Fix mbstring support for CP1254 encoding One funny thing: while the original author used Unicode 0xFFFD (generic replacement character) for invalid bytes in CP1251 and CP1252, for CP1254 they used 0xFFFE, which is a Unicode noncharacter (the byte-swapped form of the byte-order mark, U+FEFF) rather than a replacement character. This was probably a mistake. In any case: - Fixed identify filter, which was completely wrong. - Don't convert Unicode 0xFFFE to a random (but valid) CP1254 byte. - When converting CP1254 to CP1254, don't pass invalid bytes through silently. --- .../libmbfl/filters/mbfilter_cp1254.c | 65 +++++-------------- .../libmbfl/filters/unicode_table_cp1254.h | 8 +-- ext/mbstring/libmbfl/mbfl/mbfl_consts.h | 1 - 3 files changed, 19 insertions(+), 55 deletions(-) diff --git a/ext/mbstring/libmbfl/filters/mbfilter_cp1254.c b/ext/mbstring/libmbfl/filters/mbfilter_cp1254.c index eebd5b322ac..e0ca60e7c08 100644 --- a/ext/mbstring/libmbfl/filters/mbfilter_cp1254.c +++ b/ext/mbstring/libmbfl/filters/mbfilter_cp1254.c @@ -74,79 +74,44 @@ const struct mbfl_convert_vtbl vtbl_wchar_cp1254 = { #define CK(statement) do { if ((statement) < 0) return (-1); } while (0) -/* - * wchar => cp1254 - */ -int -mbfl_filt_conv_wchar_cp1254(int c, mbfl_convert_filter *filter) +int mbfl_filt_conv_wchar_cp1254(int c, mbfl_convert_filter *filter) { - int s, n; - if (c < 0x80) { - s = c; + CK((*filter->output_function)(c, filter->data)); } else { - s = -1; - n = cp1254_ucs_table_len-1; - while (n >= 0) { - if (c == cp1254_ucs_table[n] && c != 0xfffe) { - s = cp1254_ucs_table_min + n; - break; + for (int n = 0; n < cp1254_ucs_table_len; n++) { + if (c == cp1254_ucs_table[n]) { + CK((*filter->output_function)(cp1254_ucs_table_min + n, filter->data)); + return c; } - n--; } - if (s <= 0 && (c & ~MBFL_WCSPLANE_MASK) == MBFL_WCSPLANE_CP1254) { - s = c & MBFL_WCSPLANE_MASK; - } - } - - if (s 
>= 0) { - CK((*filter->output_function)(s, filter->data)); - } else { CK(mbfl_filt_conv_illegal_output(c, filter)); } return c; } -/* - * cp1254 => wchar - */ -int -mbfl_filt_conv_cp1254_wchar(int c, mbfl_convert_filter *filter) +int mbfl_filt_conv_cp1254_wchar(int c, mbfl_convert_filter *filter) { int s; - if (c >= 0 && c < cp1254_ucs_table_min) { + if (c < cp1254_ucs_table_min) { s = c; - } else if (c >= cp1254_ucs_table_min && c < 0x100) { - s = cp1254_ucs_table[c - cp1254_ucs_table_min]; - if (s <= 0) { - s = c; - s &= MBFL_WCSPLANE_MASK; - s |= MBFL_WCSPLANE_CP1254; - } } else { - s = c; - s &= MBFL_WCSGROUP_MASK; - s |= MBFL_WCSGROUP_THROUGH; + s = cp1254_ucs_table[c - cp1254_ucs_table_min]; + if (!s) { + s = c | MBFL_WCSGROUP_THROUGH; + } } CK((*filter->output_function)(s, filter->data)); - return c; } -/* We only distinguish the MS extensions to ISO-8859-1. - * Actually, this is pretty much a NO-OP, since the identification - * system doesn't allow us to discriminate between a positive match, - * a possible match and a definite non-match. - * The problem here is that cp1254 looks like SJIS for certain chars. 
- * */ static int mbfl_filt_ident_cp1254(int c, mbfl_identify_filter *filter) { - if (c >= 0x80 && c < 0xff) - filter->flag = 0; - else - filter->flag = 1; /* not it */ + if (c >= 0x81 && c <= 0x9E && !cp1254_ucs_table[c - cp1254_ucs_table_min]) { + filter->flag = 1; + } return c; } diff --git a/ext/mbstring/libmbfl/filters/unicode_table_cp1254.h b/ext/mbstring/libmbfl/filters/unicode_table_cp1254.h index 7a9c4424add..1e70af33a2f 100644 --- a/ext/mbstring/libmbfl/filters/unicode_table_cp1254.h +++ b/ext/mbstring/libmbfl/filters/unicode_table_cp1254.h @@ -27,10 +27,10 @@ /* cp1254 to Unicode table */ static const unsigned short cp1254_ucs_table[] = { - 0x20ac, 0xfffe, 0x201a, 0x0192, 0x201e, 0x2026, 0x2020, 0x2021, - 0x02c6, 0x2030, 0x0160, 0x2039, 0x0152, 0xfffe, 0xfffe, 0xfffe, - 0xfffe, 0x2018, 0x2019, 0x201c, 0x201d, 0x2022, 0x2013, 0x2014, - 0x02dc, 0x2122, 0x0161, 0x203a, 0x0153, 0xfffe, 0xfffe, 0x0178, + 0x20ac, 0x0000, 0x201a, 0x0192, 0x201e, 0x2026, 0x2020, 0x2021, + 0x02c6, 0x2030, 0x0160, 0x2039, 0x0152, 0x0000, 0x0000, 0x0000, + 0x0000, 0x2018, 0x2019, 0x201c, 0x201d, 0x2022, 0x2013, 0x2014, + 0x02dc, 0x2122, 0x0161, 0x203a, 0x0153, 0x0000, 0x0000, 0x0178, 0x00a0, 0x00a1, 0x00a2, 0x00a3, 0x00a4, 0x00a5, 0x00a6, 0x00a7, 0x00a8, 0x00a9, 0x00aa, 0x00ab, 0x00ac, 0x00ad, 0x00ae, 0x00af, 0x00b0, 0x00b1, 0x00b2, 0x00b3, 0x00b4, 0x00b5, 0x00b6, 0x00b7, diff --git a/ext/mbstring/libmbfl/mbfl/mbfl_consts.h b/ext/mbstring/libmbfl/mbfl/mbfl_consts.h index f98015b5721..f94302f79d8 100644 --- a/ext/mbstring/libmbfl/mbfl/mbfl_consts.h +++ b/ext/mbstring/libmbfl/mbfl/mbfl_consts.h @@ -75,7 +75,6 @@ #define MBFL_WCSPLANE_8859_16 0x70fa0000 /* 00h - FFh */ #define MBFL_WCSPLANE_ARMSCII8 0x70fb0000 #define MBFL_WCSPLANE_KOI8U 0x70fc0000 -#define MBFL_WCSPLANE_CP1254 0x70fd0000 /* 00h - FFh */ #define MBFL_WCSPLANE_CP850 0x70fe0000 /* 00h - FFh */ #define MBFL_WCSPLANE_GB18030 0x70ff0000 /* a1a1h-e3329a35h */ #define MBFL_WCSGROUP_MASK 0xffffff