mirror of
https://github.com/php/php-src.git
synced 2026-04-29 19:23:22 +02:00
ISO-2022-JP-MS treats truncated multi-byte chars as error
Sigh. I included tests which were intended to check this case in the test suite for ISO-2022-JP-MS, but those tests were faulty and didn't actually test what they were supposed to. Fixing the tests revealed that there were still bugs in this area.
This commit is contained in:
@@ -34,6 +34,8 @@
|
||||
#include "unicode_table_jis.h"
|
||||
#include "cp932_table.h"
|
||||
|
||||
static int mbfl_filt_conv_2022jpms_wchar_flush(mbfl_convert_filter *filter);
|
||||
|
||||
static const char *mbfl_encoding_2022jpms_aliases[] = {"ISO2022JPMS", NULL};
|
||||
|
||||
const mbfl_encoding mbfl_encoding_2022jpms = {
|
||||
@@ -53,7 +55,7 @@ const struct mbfl_convert_vtbl vtbl_2022jpms_wchar = {
|
||||
mbfl_filt_conv_common_ctor,
|
||||
NULL,
|
||||
mbfl_filt_conv_2022jpms_wchar,
|
||||
mbfl_filt_conv_common_flush,
|
||||
mbfl_filt_conv_2022jpms_wchar_flush,
|
||||
NULL,
|
||||
};
|
||||
|
||||
@@ -144,8 +146,7 @@ int mbfl_filt_conv_2022jpms_wchar(int c, mbfl_convert_filter *filter)
|
||||
} else {
|
||||
if (c1 > 0x20 && c1 < 0x35) {
|
||||
w = 0xE000 + ((c1 - 0x21) * 94) + c - 0x21;
|
||||
}
|
||||
if (w <= 0) {
|
||||
} else {
|
||||
w = (((c1 - 0x21) + 0x7f) << 8) | c | MBFL_WCSPLANE_JIS0208;
|
||||
}
|
||||
}
|
||||
@@ -206,6 +207,30 @@ int mbfl_filt_conv_2022jpms_wchar(int c, mbfl_convert_filter *filter)
|
||||
return c;
|
||||
}
|
||||
|
||||
|
||||
/*
 * Flush handler for the ISO-2022-JP-MS -> wchar conversion filter.
 *
 * When the low four bits of filter->status are non-zero at flush time, a
 * single value OR-ed with MBFL_WCSGROUP_THROUGH is handed to the output
 * function:
 *   - state 2 -> 0x1B      (byte ESC)
 *   - state 3 -> 0x1B24    (bytes ESC '$')
 *   - state 4 -> 0x1B2428  (bytes ESC '$' '(')
 *   - state 5 -> 0x1B28    (bytes ESC '(')
 *   - any other non-zero state -> the current contents of filter->cache
 * NOTE(review): the state numbers are assigned by the decoder, which is not
 * visible here; presumably they mark an escape sequence or multi-byte
 * character that was cut off by the end of input -- confirm against
 * mbfl_filt_conv_2022jpms_wchar.
 *
 * Afterwards the filter's flush_function is invoked if one is set.
 * The return value of the output function is ignored; this function always
 * returns 0.
 */
static int mbfl_filt_conv_2022jpms_wchar_flush(mbfl_convert_filter *filter)
{
	/* Only the low nibble of the status word selects what is emitted. */
	const int pending = filter->status & 0xF;

	switch (pending) {
	case 0:
		/* Nothing is emitted when the low nibble is clear. */
		break;
	case 2:
		(*filter->output_function)(0x1B | MBFL_WCSGROUP_THROUGH, filter->data);
		break;
	case 3:
		(*filter->output_function)(0x1B24 | MBFL_WCSGROUP_THROUGH, filter->data);
		break;
	case 4:
		(*filter->output_function)(0x1B2428 | MBFL_WCSGROUP_THROUGH, filter->data);
		break;
	case 5:
		(*filter->output_function)(0x1B28 | MBFL_WCSGROUP_THROUGH, filter->data);
		break;
	default:
		/* Every remaining non-zero state reports the cached value. */
		(*filter->output_function)(filter->cache | MBFL_WCSGROUP_THROUGH, filter->data);
		break;
	}

	/* Propagate the flush to the next stage when one is attached. */
	if (filter->flush_function) {
		(*filter->flush_function)(filter->data);
	}

	return 0;
}
|
||||
|
||||
static int cp932ext3_cp932ext2_jis(int c)
|
||||
{
|
||||
int idx;
|
||||
|
||||
@@ -55,7 +55,7 @@ foreach ([0x8790, 0x8791, 0x8792, 0x8795, 0x8796, 0x8797, 0x879A, 0x879B, 0x879C
|
||||
$udcChars = array();
|
||||
for ($cp = 0xE000; $cp < (0xE000 + (20 * 94)); $cp++) {
|
||||
$i = $cp - 0xE000;
|
||||
$bytes = (( (int)($i / 94) + 0x7F - 0x5E) << 8) + (($i % 94) + 0x21);
|
||||
$bytes = (((int)($i / 94) + 0x21) << 8) + (($i % 94) + 0x21);
|
||||
$udcChars[pack('n', $bytes)] = pack('N', $cp);
|
||||
}
|
||||
|
||||
@@ -175,18 +175,16 @@ foreach (array_keys($truncatedChars) as $truncated)
|
||||
echo "JIS X 0208 (with MS extensions) support OK\n";
|
||||
|
||||
$validChars = $udcChars;
|
||||
/* We allow ASCII/JIS X 0201 characters to appear even in JIS X 0208 mode */
|
||||
for ($i = 0; $i <= 0x7F; $i++)
|
||||
$validChars[chr($i)] = chr($i);
|
||||
for ($i = 0xA1; $i <= 0xDF; $i++)
|
||||
$validChars[chr($i)] = $jisx0201Chars[chr($i)];
|
||||
$lenTable = array_fill_keys(range(0xE0, 0xFC), 2) + array_fill_keys(range(0x81, 0x9F), 2);
|
||||
findInvalidChars($validChars, $invalidChars, $truncatedChars, $lenTable);
|
||||
findInvalidChars($validChars, $invalidChars, $truncatedChars, array_fill_keys(range(0x21, 0x7F), 2));
|
||||
|
||||
testAllValidCharsWithPrefix($udcChars, "\x1B\$(?", true);
|
||||
|
||||
foreach (array_keys($invalidChars) as $invalid) {
|
||||
$firstByte = ord($invalid[0]);
|
||||
$firstByte = ord(substr($invalid, 0, 1));
|
||||
if (($firstByte > 0x80 && $firstByte < 0xA0) || $firstByte >= 0xE0) {
|
||||
testInvalidString("\x1B\$(?" . $invalid[0], "\x00\x00\x00%", 'ISO-2022-JP-MS', 'UTF-32BE');
|
||||
} else {
|
||||
@@ -201,7 +199,15 @@ echo "UDC support OK\n";
|
||||
// Test "long" illegal character markers
|
||||
mb_substitute_character("long");
|
||||
convertInvalidString("\xE0", "BAD+E0", "ISO-2022-JP-MS", "UTF-8");
|
||||
convertInvalidString("\x1B\$(X", "BAD+242858", "ISO-2022-JP-MS", "UTF-8"); // Invalid escape
|
||||
// Invalid escapes:
|
||||
convertInvalidString("\x1B", "BAD+1B", "ISO-2022-JP-MS", "UTF-8");
|
||||
convertInvalidString("\x1B.", "BAD+1B2E", "ISO-2022-JP-MS", "UTF-8");
|
||||
convertInvalidString("\x1B\$", "BAD+1B24", "ISO-2022-JP-MS", "UTF-8");
|
||||
convertInvalidString("\x1B\$.", "BAD+1B242E", "ISO-2022-JP-MS", "UTF-8");
|
||||
convertInvalidString("\x1B(", "BAD+1B28", "ISO-2022-JP-MS", "UTF-8");
|
||||
convertInvalidString("\x1B(.", "BAD+1B282E", "ISO-2022-JP-MS", "UTF-8");
|
||||
convertInvalidString("\x1B\$(", "BAD+1B2428", "ISO-2022-JP-MS", "UTF-8");
|
||||
convertInvalidString("\x1B\$(X", "BAD+242858", "ISO-2022-JP-MS", "UTF-8");
|
||||
convertInvalidString("\x1B\$B\x9F", "BAD+9F", "ISO-2022-JP-MS", "UTF-8"); // 0x9F does not start any 2-byte character
|
||||
|
||||
echo "Done!\n";
|
||||
|
||||
Reference in New Issue
Block a user