1
0
mirror of https://github.com/php/php-src.git synced 2026-04-21 23:18:13 +02:00
Files
archived-php-src/ext/mbstring/tests/sjis_encoding.phpt
T
Alex Dowad ad7e0f16cc Fix mbstring support for Shift-JIS
- Reject otherwise valid kuten codes which don't map to anything in JIS X 0208.
- Handle truncated multi-byte characters as an error.
- Convert Shift-JIS 0x7E to Unicode 0x203E (overline) as recommended by the
  Unicode Consortium, and as iconv does.
- Convert Shift-JIS 0x5C to Unicode 0xA5 (yen sign) as recommended by the
  Unicode Consortium, and as iconv does.
  (NOTE: This will affect PHP scripts which use an internal encoding of
  Shift-JIS! PHP assigns a special meaning to 0x5C, the backslash. For example,
  it is used for escapes in double-quoted strings. Mapping the Shift-JIS yen
  sign to the Unicode yen sign means the yen sign will not be usable for
  C escapes in double-quoted strings. Japanese PHP programmers who want to
  write their source code in Shift-JIS for some strange reason will have to
  use the JIS X 0208 backlash or 'REVERSE SOLIDUS' character for their C
  escapes.)
- Convert Unicode 0x5C (backslash) to Shift-JIS 0x815F (reverse solidus).
- Immediately handle error if first Shift-JIS byte is over 0xEF, rather than
  waiting to see the next byte. (Previously, the value used was 0xFC, which is
  the limit for the 2nd byte and not the 1st byte of a multi-byte character.)
- Don't allow 'control characters' to appear in the middle of a multi-byte
  character.

The test case for bug 47399 is now obsolete. That test assumed that a number
of Shift-JIS byte sequences which don't map to any character were 'valid'
(because the byte values were within the legal ranges).
2020-11-09 13:45:16 +02:00

62 lines
2.5 KiB
PHP

--TEST--
Exhaustive test of Shift-JIS encoding verification and conversion
--SKIPIF--
<?php
extension_loaded('mbstring') or die('skip mbstring not available');
if (getenv("SKIP_SLOW_TESTS")) die("skip slow test");
?>
--FILE--
<?php
srand(999); /* Make results consistent */
include('encoding_tests.inc');
mb_substitute_character(0x25); // '%'
/* Read in the table of all characters in Shift-JIS */
readConversionTable(__DIR__ . '/data/SHIFTJIS.txt', $validChars, $fromUnicode);
for ($i = 0; $i < 0x20; $i++) {
$validChars[chr($i)] = "\x00" . chr($i);
$fromUnicode["\x00" . chr($i)] = chr($i);
}
/* DEL character */
$validChars["\x7F"] = "\x00\x7F";
$fromUnicode["\x00\x7F"] = "\x7F";
/* Although Shift-JIS uses 0x7E for an overline, we will map Unicode 0x7E
* (tilde) to Shift-JIS 0x7E (as iconv does) */
$fromUnicode["\x00\x7E"] = "\x7E";
/* Use fullwidth reverse solidus, not (halfwidth) backslash (0x5C) */
$validChars["\x81\x5F"] = "\xFF\x3C";
$fromUnicode["\xFF\x3C"] = "\x81\x5F";
/* Unicode has both halfwidth and fullwidth NOT SIGN; convert both of them
* to JIS X 0208 NOT SIGN */
$fromUnicode["\xFF\xE2"] = "\x81\xCA";
/* Likewise for fullwidth and halfwidth POUND SIGN */
$fromUnicode["\xFF\xE1"] = "\x81\x92";
/* Likewise for fullwidth and halfwidth CENT SIGN */
$fromUnicode["\xFF\xE0"] = "\x81\x91";
/* Convert Unicode FULLWIDTH TILDE to JIS X 0208 WAVE DASH */
$fromUnicode["\xFF\x5E"] = "\x81\x60";
/* Convert Unicode FULLWIDTH HYPHEN-MINUS to JIS X 0208 MINUS SIGN */
$fromUnicode["\xFF\x0D"] = "\x81\x7C";
/* Convert Unicode PARALLEL TO to JIS X 0208 DOUBLE VERTICAL LINE */
$fromUnicode["\x22\x25"] = "\x81\x61";
testAllValidChars($validChars, 'Shift-JIS', 'UTF-16BE');
echo "SJIS verification and conversion works on all valid characters\n";
findInvalidChars($validChars, $invalidChars, $truncated,
map(range(0x81, 0x9F), 2, map(range(0xE0, 0xEF), 2)));
testAllInvalidChars($invalidChars, $validChars, 'Shift-JIS', 'UTF-16BE', "\x00%");
testTruncatedChars($truncated, 'Shift-JIS', 'UTF-16BE', "\x00%");
echo "SJIS verification and conversion works on all invalid characters\n";
findInvalidChars($fromUnicode, $invalidChars, $unused, map(range(0, 0xFF), 2));
convertAllInvalidChars($invalidChars, $fromUnicode, 'UTF-16BE', 'Shift-JIS', '%');
echo "Unicode -> SJIS conversion works on all invalid characters\n";
?>
--EXPECT--
SJIS verification and conversion works on all valid characters
SJIS verification and conversion works on all invalid characters
Unicode -> SJIS conversion works on all invalid characters