mirror of
https://github.com/php/php-src.git
synced 2026-04-02 05:32:28 +02:00
Fix mbstring support for CP1254 encoding
One funny thing: while the original author used Unicode 0xFFFD (the generic replacement character) for invalid bytes in CP1251 and CP1252, for CP1254 they used 0xFFFE, which is not a valid Unicode character at all, but is a byte-swapped byte-order mark. This was probably a mistake.

Anyway, this commit:
- Fixes the identify filter, which was completely wrong.
- Stops converting Unicode 0xFFFE to a random (but valid) CP1254 byte.
- Stops passing invalid bytes through silently when converting CP1254 to CP1254.
This commit is contained in:
@@ -74,79 +74,44 @@ const struct mbfl_convert_vtbl vtbl_wchar_cp1254 = {
|
||||
|
||||
#define CK(statement) do { if ((statement) < 0) return (-1); } while (0)
|
||||
|
||||
/*
|
||||
* wchar => cp1254
|
||||
*/
|
||||
int
|
||||
mbfl_filt_conv_wchar_cp1254(int c, mbfl_convert_filter *filter)
|
||||
int mbfl_filt_conv_wchar_cp1254(int c, mbfl_convert_filter *filter)
|
||||
{
|
||||
int s, n;
|
||||
|
||||
if (c < 0x80) {
|
||||
s = c;
|
||||
CK((*filter->output_function)(c, filter->data));
|
||||
} else {
|
||||
s = -1;
|
||||
n = cp1254_ucs_table_len-1;
|
||||
while (n >= 0) {
|
||||
if (c == cp1254_ucs_table[n] && c != 0xfffe) {
|
||||
s = cp1254_ucs_table_min + n;
|
||||
break;
|
||||
for (int n = 0; n < cp1254_ucs_table_len; n++) {
|
||||
if (c == cp1254_ucs_table[n]) {
|
||||
CK((*filter->output_function)(cp1254_ucs_table_min + n, filter->data));
|
||||
return c;
|
||||
}
|
||||
n--;
|
||||
}
|
||||
if (s <= 0 && (c & ~MBFL_WCSPLANE_MASK) == MBFL_WCSPLANE_CP1254) {
|
||||
s = c & MBFL_WCSPLANE_MASK;
|
||||
}
|
||||
}
|
||||
|
||||
if (s >= 0) {
|
||||
CK((*filter->output_function)(s, filter->data));
|
||||
} else {
|
||||
CK(mbfl_filt_conv_illegal_output(c, filter));
|
||||
}
|
||||
|
||||
return c;
|
||||
}
|
||||
|
||||
/*
|
||||
* cp1254 => wchar
|
||||
*/
|
||||
int
|
||||
mbfl_filt_conv_cp1254_wchar(int c, mbfl_convert_filter *filter)
|
||||
int mbfl_filt_conv_cp1254_wchar(int c, mbfl_convert_filter *filter)
|
||||
{
|
||||
int s;
|
||||
|
||||
if (c >= 0 && c < cp1254_ucs_table_min) {
|
||||
if (c < cp1254_ucs_table_min) {
|
||||
s = c;
|
||||
} else if (c >= cp1254_ucs_table_min && c < 0x100) {
|
||||
s = cp1254_ucs_table[c - cp1254_ucs_table_min];
|
||||
if (s <= 0) {
|
||||
s = c;
|
||||
s &= MBFL_WCSPLANE_MASK;
|
||||
s |= MBFL_WCSPLANE_CP1254;
|
||||
}
|
||||
} else {
|
||||
s = c;
|
||||
s &= MBFL_WCSGROUP_MASK;
|
||||
s |= MBFL_WCSGROUP_THROUGH;
|
||||
s = cp1254_ucs_table[c - cp1254_ucs_table_min];
|
||||
if (!s) {
|
||||
s = c | MBFL_WCSGROUP_THROUGH;
|
||||
}
|
||||
}
|
||||
|
||||
CK((*filter->output_function)(s, filter->data));
|
||||
|
||||
return c;
|
||||
}
|
||||
|
||||
/* We only distinguish the MS extensions to ISO-8859-1.
|
||||
* Actually, this is pretty much a NO-OP, since the identification
|
||||
* system doesn't allow us to discriminate between a positive match,
|
||||
* a possible match and a definite non-match.
|
||||
* The problem here is that cp1254 looks like SJIS for certain chars.
|
||||
* */
|
||||
static int mbfl_filt_ident_cp1254(int c, mbfl_identify_filter *filter)
|
||||
{
|
||||
if (c >= 0x80 && c < 0xff)
|
||||
filter->flag = 0;
|
||||
else
|
||||
filter->flag = 1; /* not it */
|
||||
if (c >= 0x81 && c <= 0x9E && !cp1254_ucs_table[c - cp1254_ucs_table_min]) {
|
||||
filter->flag = 1;
|
||||
}
|
||||
return c;
|
||||
}
|
||||
|
||||
@@ -27,10 +27,10 @@
|
||||
|
||||
/* cp1254 to Unicode table */
|
||||
static const unsigned short cp1254_ucs_table[] = {
|
||||
0x20ac, 0xfffe, 0x201a, 0x0192, 0x201e, 0x2026, 0x2020, 0x2021,
|
||||
0x02c6, 0x2030, 0x0160, 0x2039, 0x0152, 0xfffe, 0xfffe, 0xfffe,
|
||||
0xfffe, 0x2018, 0x2019, 0x201c, 0x201d, 0x2022, 0x2013, 0x2014,
|
||||
0x02dc, 0x2122, 0x0161, 0x203a, 0x0153, 0xfffe, 0xfffe, 0x0178,
|
||||
0x20ac, 0x0000, 0x201a, 0x0192, 0x201e, 0x2026, 0x2020, 0x2021,
|
||||
0x02c6, 0x2030, 0x0160, 0x2039, 0x0152, 0x0000, 0x0000, 0x0000,
|
||||
0x0000, 0x2018, 0x2019, 0x201c, 0x201d, 0x2022, 0x2013, 0x2014,
|
||||
0x02dc, 0x2122, 0x0161, 0x203a, 0x0153, 0x0000, 0x0000, 0x0178,
|
||||
0x00a0, 0x00a1, 0x00a2, 0x00a3, 0x00a4, 0x00a5, 0x00a6, 0x00a7,
|
||||
0x00a8, 0x00a9, 0x00aa, 0x00ab, 0x00ac, 0x00ad, 0x00ae, 0x00af,
|
||||
0x00b0, 0x00b1, 0x00b2, 0x00b3, 0x00b4, 0x00b5, 0x00b6, 0x00b7,
|
||||
|
||||
@@ -75,7 +75,6 @@
|
||||
#define MBFL_WCSPLANE_8859_16 0x70fa0000 /* 00h - FFh */
|
||||
#define MBFL_WCSPLANE_ARMSCII8 0x70fb0000
|
||||
#define MBFL_WCSPLANE_KOI8U 0x70fc0000
|
||||
#define MBFL_WCSPLANE_CP1254 0x70fd0000 /* 00h - FFh */
|
||||
#define MBFL_WCSPLANE_CP850 0x70fe0000 /* 00h - FFh */
|
||||
#define MBFL_WCSPLANE_GB18030 0x70ff0000 /* a1a1h-e3329a35h */
|
||||
#define MBFL_WCSGROUP_MASK 0xffffff
|
||||
|
||||
Reference in New Issue
Block a user