From d62f535caaddc7fe4d888df38283e3f24cc788e2 Mon Sep 17 00:00:00 2001 From: Alex Dowad Date: Mon, 6 Jun 2022 22:44:58 +0200 Subject: [PATCH] Restore backwards-compatible mappings of 0x5C and 0x7E in SJIS According to the relevant Japanese Industrial Standards Committee (JISC) standards, SJIS 0x5C is a Yen sign, and 0x7E is an overline. However, this conflicts with the implementation of SJIS in various legacy software (notably Microsoft products), where SJIS 0x5C and 0x7E are taken as equivalent to the same ASCII bytes. Prior to PHP 8.1, mbstring's implementation of SJIS handled these bytes compatibly with Microsoft products. This was changed in PHP 8.1.0, in an attempt to comply with the JISC specifications. However, after discussion with various concerned Japanese developers, it seems that the historical behavior was more useful in the majority of applications which process SJIS-encoded text. Since we are now treating SJIS 0x5C as equivalent to U+005C and 0x7E as equivalent to U+007E, it does not make sense to convert U+203E (OVERLINE) to 0x7E, nor does it make sense to convert U+00A5 (YEN SIGN) to 0x5C. Restore the mappings for those codepoints from before PHP 8.1.0. Fixes GH-8281. --- NEWS | 2 ++ ext/mbstring/libmbfl/filters/mbfilter_sjis.c | 22 +++----------- ext/mbstring/tests/sjis_encoding.phpt | 32 +++++++++++++++++--- 3 files changed, 35 insertions(+), 21 deletions(-) diff --git a/NEWS b/NEWS index ae3c521c60b..482ccc4a560 100644 --- a/NEWS +++ b/NEWS @@ -21,6 +21,8 @@ PHP NEWS . mb_detect_encoding recognizes all letters in Czech alphabet (alexdowad) . mb_detect_encoding recognizes all letters in Hungarian alphabet (alexdowad) . Fixed bug GH-8685 (pcre not ready at mbstring startup). (Remi) + . Fixed bug GH-8281 (mb_convert_encoding "\" and "~" convert failed to + Shift_JIS). (alexdowad) - OPcache: . Fixed bug GH-8591 (tracing JIT crash after private instance method change). 
diff --git a/ext/mbstring/libmbfl/filters/mbfilter_sjis.c b/ext/mbstring/libmbfl/filters/mbfilter_sjis.c index 188f162bf87..96456b26e7e 100644 --- a/ext/mbstring/libmbfl/filters/mbfilter_sjis.c +++ b/ext/mbstring/libmbfl/filters/mbfilter_sjis.c @@ -141,11 +141,7 @@ int mbfl_filt_conv_sjis_wchar(int c, mbfl_convert_filter *filter) switch (filter->status) { case 0: - if (c == 0x5C) { - CK((*filter->output_function)(0xA5, filter->data)); - } else if (c == 0x7E) { - CK((*filter->output_function)(0x203E, filter->data)); - } else if (c >= 0 && c < 0x80) { /* ASCII */ + if (c >= 0 && c < 0x80) { /* ASCII */ CK((*filter->output_function)(c, filter->data)); } else if (c > 0xA0 && c < 0xE0) { /* Kana */ CK((*filter->output_function)(0xFEC0 + c, filter->data)); @@ -197,17 +193,7 @@ int mbfl_filt_conv_wchar_sjis(int c, mbfl_convert_filter *filter) int c1, c2, s1, s2; s1 = 0; - if (c == 0x5C) { - /* Unicode 0x5C is a backslash; but Shift-JIS uses 0x5C for the - * Yen sign. JIS X 0208 kuten 0x2140 is a backslash. */ - s1 = 0x2140; - } else if (c == 0x7E) { - /* Unicode 0x7E is a tilde, but Shift-JIS uses 0x7E for overline (or - * macron). 
JIS X 0208 kuten 0x2141 is 'WAVE DASH' */ - s1 = 0x2141; - } else if (c == 0xAF || c == 0x203E) { /* U+00AF is MACRON, U+203E is OVERLINE */ - s1 = 0x7E; /* Halfwidth overline/macron */ - } else if (c >= ucs_a1_jis_table_min && c < ucs_a1_jis_table_max) { + if (c >= ucs_a1_jis_table_min && c < ucs_a1_jis_table_max) { s1 = ucs_a1_jis_table[c - ucs_a1_jis_table_min]; } else if (c >= ucs_a2_jis_table_min && c < ucs_a2_jis_table_max) { s1 = ucs_a2_jis_table[c - ucs_a2_jis_table_min]; @@ -218,7 +204,9 @@ int mbfl_filt_conv_wchar_sjis(int c, mbfl_convert_filter *filter) } if (s1 <= 0) { if (c == 0xA5) { /* YEN SIGN */ - s1 = 0x5C; + s1 = 0x216F; /* FULLWIDTH YEN SIGN */ + } else if (c == 0xAF || c == 0x203E) { /* U+00AF is MACRON, U+203E is OVERLINE */ + s1 = 0x2131; /* FULLWIDTH MACRON */ } else if (c == 0xFF3C) { /* FULLWIDTH REVERSE SOLIDUS */ s1 = 0x2140; } else if (c == 0xFF5E) { /* FULLWIDTH TILDE */ diff --git a/ext/mbstring/tests/sjis_encoding.phpt b/ext/mbstring/tests/sjis_encoding.phpt index aece04b0f22..c94cc464714 100644 --- a/ext/mbstring/tests/sjis_encoding.phpt +++ b/ext/mbstring/tests/sjis_encoding.phpt @@ -20,13 +20,37 @@ for ($i = 0; $i < 0x20; $i++) { $fromUnicode["\x00" . chr($i)] = chr($i); } -/* U+007E is TILDE; convert to Shift-JIS 0x8160 (WAVE DASH) */ -$fromUnicode["\x00\x7E"] = "\x81\x60"; +/* According to the relevant Japan Industrial Standards Committee standards, + * SJIS 0x5C is a Yen sign, and 0x7E is an overline. + * + * However, this conflicts with the implementation of SJIS in various legacy + * software (notably Microsoft products), where SJIS 0x5C and 0x7E are taken + * as equivalent to the same ASCII bytes. + * + * Prior to PHP 8.1, mbstring's implementation of SJIS handled these bytes + * compatibly with Microsoft products. This was changed in PHP 8.1.0, in an + * attempt to comply with the JISC specifications. 
However, after discussion + * with various concerned Japanese developers, it seems that the historical + * behavior was more useful in the majority of applications which process + * SJIS-encoded text. */ +$validChars["\x5C"] = "\x00\x5C"; +$validChars["\x7E"] = "\x00\x7E"; +$fromUnicode["\x00\x5C"] = "\x5C"; +$fromUnicode["\x00\x7E"] = "\x7E"; + +/* That means it does not make sense to convert U+203E (OVERLINE) + * to 0x7E; convert it to JIS X 0208 FULLWIDTH MACRON instead */ +$fromUnicode["\x20\x3E"] = "\x81\x50"; +/* U+00AF is MACRON; convert that to FULLWIDTH MACRON as well */ +$fromUnicode["\x00\xAF"] = "\x81\x50"; +/* Since we are treating 0x5C as equivalent to U+005C, it does not + * make sense to convert U+00A5 (YEN SIGN) to 0x5C + * Convert it to JIS X 0208 FULLWIDTH YEN SIGN instead */ +$fromUnicode["\x00\xA5"] = "\x81\x8F"; + /* DEL character */ $validChars["\x7F"] = "\x00\x7F"; $fromUnicode["\x00\x7F"] = "\x7F"; -/* U+00AF is MACRON; Shift-JIS 0x7E is overline */ -$fromUnicode["\x00\xAF"] = "\x7E"; /* Use fullwidth reverse solidus, not (halfwidth) backslash (0x5C) */ $validChars["\x81\x5F"] = "\xFF\x3C"; $fromUnicode["\xFF\x3C"] = "\x81\x5F";