mirror of
https://github.com/php/php-src.git
synced 2026-03-26 01:02:25 +01:00
Restore backwards-compatible mappings of 0x5C and 0x7E in SJIS
According to the relevant Japanese Industrial Standards Committee standards, SJIS 0x5C is a Yen sign, and 0x7E is an overline. However, this conflicts with the implementation of SJIS in various legacy software (notably Microsoft products), where SJIS 0x5C and 0x7E are taken as equivalent to the same ASCII bytes. Prior to PHP 8.1, mbstring's implementation of SJIS handled these bytes compatibly with Microsoft products. This was changed in PHP 8.1.0, in an attempt to comply with the JISC specifications. However, after discussion with various concerned Japanese developers, it seems that the historical behavior was more useful in the majority of applications which process SJIS-encoded text. Since we are now treating SJIS 0x5C as equivalent to U+005C and 0x7E as equivalent to U+007E, it does not make sense to convert U+203E (OVERLINE) to 0x7E, nor does it make sense to convert U+00A5 (YEN SIGN) to 0x5C. Restore the mappings for those codepoints from before PHP 8.1.0. Fixes GH-8281.
This commit is contained in:
committed by
Christoph M. Becker
parent
77ba689fd6
commit
d62f535caa
2
NEWS
2
NEWS
@@ -21,6 +21,8 @@ PHP NEWS
|
||||
. mb_detect_encoding recognizes all letters in Czech alphabet (alexdowad)
|
||||
. mb_detect_encoding recognizes all letters in Hungarian alphabet (alexdowad)
|
||||
. Fixed bug GH-8685 (pcre not ready at mbstring startup). (Remi)
|
||||
. Fixed bug GH-8281 (mb_convert_encoding "\" and "~" convert failed to
|
||||
Shift_JIS). (alexdowad)
|
||||
|
||||
- OPcache:
|
||||
. Fixed bug GH-8591 (tracing JIT crash after private instance method change).
|
||||
|
||||
@@ -141,11 +141,7 @@ int mbfl_filt_conv_sjis_wchar(int c, mbfl_convert_filter *filter)
|
||||
|
||||
switch (filter->status) {
|
||||
case 0:
|
||||
if (c == 0x5C) {
|
||||
CK((*filter->output_function)(0xA5, filter->data));
|
||||
} else if (c == 0x7E) {
|
||||
CK((*filter->output_function)(0x203E, filter->data));
|
||||
} else if (c >= 0 && c < 0x80) { /* ASCII */
|
||||
if (c >= 0 && c < 0x80) { /* ASCII */
|
||||
CK((*filter->output_function)(c, filter->data));
|
||||
} else if (c > 0xA0 && c < 0xE0) { /* Kana */
|
||||
CK((*filter->output_function)(0xFEC0 + c, filter->data));
|
||||
@@ -197,17 +193,7 @@ int mbfl_filt_conv_wchar_sjis(int c, mbfl_convert_filter *filter)
|
||||
int c1, c2, s1, s2;
|
||||
|
||||
s1 = 0;
|
||||
if (c == 0x5C) {
|
||||
/* Unicode 0x5C is a backslash; but Shift-JIS uses 0x5C for the
|
||||
* Yen sign. JIS X 0208 kuten 0x2140 is a backslash. */
|
||||
s1 = 0x2140;
|
||||
} else if (c == 0x7E) {
|
||||
/* Unicode 0x7E is a tilde, but Shift-JIS uses 0x7E for overline (or
|
||||
* macron). JIS X 0208 kuten 0x2141 is 'WAVE DASH' */
|
||||
s1 = 0x2141;
|
||||
} else if (c == 0xAF || c == 0x203E) { /* U+00AF is MACRON, U+203E is OVERLINE */
|
||||
s1 = 0x7E; /* Halfwidth overline/macron */
|
||||
} else if (c >= ucs_a1_jis_table_min && c < ucs_a1_jis_table_max) {
|
||||
if (c >= ucs_a1_jis_table_min && c < ucs_a1_jis_table_max) {
|
||||
s1 = ucs_a1_jis_table[c - ucs_a1_jis_table_min];
|
||||
} else if (c >= ucs_a2_jis_table_min && c < ucs_a2_jis_table_max) {
|
||||
s1 = ucs_a2_jis_table[c - ucs_a2_jis_table_min];
|
||||
@@ -218,7 +204,9 @@ int mbfl_filt_conv_wchar_sjis(int c, mbfl_convert_filter *filter)
|
||||
}
|
||||
if (s1 <= 0) {
|
||||
if (c == 0xA5) { /* YEN SIGN */
|
||||
s1 = 0x5C;
|
||||
s1 = 0x216F; /* FULLWIDTH YEN SIGN */
|
||||
} else if (c == 0xAF || c == 0x203E) { /* U+00AF is MACRON, U+203E is OVERLINE */
|
||||
s1 = 0x2131; /* FULLWIDTH MACRON */
|
||||
} else if (c == 0xFF3C) { /* FULLWIDTH REVERSE SOLIDUS */
|
||||
s1 = 0x2140;
|
||||
} else if (c == 0xFF5E) { /* FULLWIDTH TILDE */
|
||||
|
||||
@@ -20,13 +20,37 @@ for ($i = 0; $i < 0x20; $i++) {
|
||||
$fromUnicode["\x00" . chr($i)] = chr($i);
|
||||
}
|
||||
|
||||
/* U+007E is TILDE; convert to Shift-JIS 0x8160 (WAVE DASH) */
|
||||
$fromUnicode["\x00\x7E"] = "\x81\x60";
|
||||
/* According to the relevant Japanese Industrial Standards Committee standards,
|
||||
* SJIS 0x5C is a Yen sign, and 0x7E is an overline.
|
||||
*
|
||||
* However, this conflicts with the implementation of SJIS in various legacy
|
||||
* software (notably Microsoft products), where SJIS 0x5C and 0x7E are taken
|
||||
* as equivalent to the same ASCII bytes.
|
||||
*
|
||||
* Prior to PHP 8.1, mbstring's implementation of SJIS handled these bytes
|
||||
* compatibly with Microsoft products. This was changed in PHP 8.1.0, in an
|
||||
* attempt to comply with the JISC specifications. However, after discussion
|
||||
* with various concerned Japanese developers, it seems that the historical
|
||||
* behavior was more useful in the majority of applications which process
|
||||
* SJIS-encoded text. */
|
||||
$validChars["\x5C"] = "\x00\x5C";
|
||||
$validChars["\x7E"] = "\x00\x7E";
|
||||
$fromUnicode["\x00\x5C"] = "\x5C";
|
||||
$fromUnicode["\x00\x7E"] = "\x7E";
|
||||
|
||||
/* That means it does not make sense to convert U+203E (OVERLINE)
|
||||
* to 0x7E; convert it to JIS X 0208 FULLWIDTH MACRON instead */
|
||||
$fromUnicode["\x20\x3E"] = "\x81\x50";
|
||||
/* U+00AF is MACRON; convert that to FULLWIDTH MACRON as well */
|
||||
$fromUnicode["\x00\xAF"] = "\x81\x50";
|
||||
/* Since we are treating 0x5C as equivalent to U+005C, it does not
|
||||
* make sense to convert U+00A5 (YEN SIGN) to 0x5C.
|
||||
* Convert it to JIS X 0208 FULLWIDTH YEN SIGN instead */
|
||||
$fromUnicode["\x00\xA5"] = "\x81\x8F";
|
||||
|
||||
/* DEL character */
|
||||
$validChars["\x7F"] = "\x00\x7F";
|
||||
$fromUnicode["\x00\x7F"] = "\x7F";
|
||||
/* U+00AF is MACRON; Shift-JIS 0x7E is overline */
|
||||
$fromUnicode["\x00\xAF"] = "\x7E";
|
||||
/* Use fullwidth reverse solidus, not (halfwidth) backslash (0x5C) */
|
||||
$validChars["\x81\x5F"] = "\xFF\x3C";
|
||||
$fromUnicode["\xFF\x3C"] = "\x81\x5F";
|
||||
|
||||
Reference in New Issue
Block a user