1
0
mirror of https://github.com/php/php-src.git synced 2026-04-29 19:23:22 +02:00

ISO-2022-JP-MS treats truncated multi-byte chars as error

Sigh. I included tests which were intended to check this case in the
test suite for ISO-2022-JP-MS, but those tests were faulty and didn't
actually test what they were supposed to.

Fixing the tests revealed that there were still bugs in this area.
This commit is contained in:
Alex Dowad
2021-08-21 21:14:01 +02:00
parent 57a81af041
commit 51e0d323e4
2 changed files with 40 additions and 9 deletions
@@ -34,6 +34,8 @@
#include "unicode_table_jis.h"
#include "cp932_table.h"
static int mbfl_filt_conv_2022jpms_wchar_flush(mbfl_convert_filter *filter);
static const char *mbfl_encoding_2022jpms_aliases[] = {"ISO2022JPMS", NULL};
const mbfl_encoding mbfl_encoding_2022jpms = {
@@ -53,7 +55,7 @@ const struct mbfl_convert_vtbl vtbl_2022jpms_wchar = {
mbfl_filt_conv_common_ctor,
NULL,
mbfl_filt_conv_2022jpms_wchar,
mbfl_filt_conv_common_flush,
mbfl_filt_conv_2022jpms_wchar_flush,
NULL,
};
@@ -144,8 +146,7 @@ int mbfl_filt_conv_2022jpms_wchar(int c, mbfl_convert_filter *filter)
} else {
if (c1 > 0x20 && c1 < 0x35) {
w = 0xE000 + ((c1 - 0x21) * 94) + c - 0x21;
}
if (w <= 0) {
} else {
w = (((c1 - 0x21) + 0x7f) << 8) | c | MBFL_WCSPLANE_JIS0208;
}
}
@@ -206,6 +207,30 @@ int mbfl_filt_conv_2022jpms_wchar(int c, mbfl_convert_filter *filter)
return c;
}
/*
 * Flush handler for the ISO-2022-JP-MS -> wchar conversion filter.
 *
 * If the low nibble of filter->status is non-zero when flushing, one value
 * tagged with MBFL_WCSGROUP_THROUGH is passed to the output function before
 * the downstream flush function (if any) is invoked.
 *
 * Always returns 0.
 */
static int mbfl_filt_conv_2022jpms_wchar_flush(mbfl_convert_filter *filter)
{
	int pending = filter->status & 0xF;

	if (pending != 0) {
		int marker;

		/*
		 * The constants below spell out the bytes ESC, ESC '$',
		 * ESC '$' '(' and ESC '(' -- presumably the partial escape
		 * sequence consumed so far in each state (TODO confirm against
		 * the main conversion function).
		 */
		switch (pending) {
		case 2:
			marker = 0x1B;
			break;
		case 3:
			marker = 0x1B24;
			break;
		case 4:
			marker = 0x1B2428;
			break;
		case 5:
			marker = 0x1B28;
			break;
		default:
			/* Any other non-zero state: report whatever is held in the cache. */
			marker = filter->cache;
			break;
		}

		(*filter->output_function)(marker | MBFL_WCSGROUP_THROUGH, filter->data);
	}

	/* Propagate the flush to the next stage of the chain, when one is set. */
	if (filter->flush_function) {
		(*filter->flush_function)(filter->data);
	}

	return 0;
}
static int cp932ext3_cp932ext2_jis(int c)
{
int idx;
+12 -6
View File
@@ -55,7 +55,7 @@ foreach ([0x8790, 0x8791, 0x8792, 0x8795, 0x8796, 0x8797, 0x879A, 0x879B, 0x879C
$udcChars = array();
for ($cp = 0xE000; $cp < (0xE000 + (20 * 94)); $cp++) {
$i = $cp - 0xE000;
$bytes = (( (int)($i / 94) + 0x7F - 0x5E) << 8) + (($i % 94) + 0x21);
$bytes = (((int)($i / 94) + 0x21) << 8) + (($i % 94) + 0x21);
$udcChars[pack('n', $bytes)] = pack('N', $cp);
}
@@ -175,18 +175,16 @@ foreach (array_keys($truncatedChars) as $truncated)
echo "JIS X 0208 (with MS extensions) support OK\n";
$validChars = $udcChars;
/* We allow ASCII/JIS X 0201 characters to appear even in JIS X 0208 mode */
for ($i = 0; $i <= 0x7F; $i++)
$validChars[chr($i)] = chr($i);
for ($i = 0xA1; $i <= 0xDF; $i++)
$validChars[chr($i)] = $jisx0201Chars[chr($i)];
$lenTable = array_fill_keys(range(0xE0, 0xFC), 2) + array_fill_keys(range(0x81, 0x9F), 2);
findInvalidChars($validChars, $invalidChars, $truncatedChars, $lenTable);
findInvalidChars($validChars, $invalidChars, $truncatedChars, array_fill_keys(range(0x21, 0x7F), 2));
testAllValidCharsWithPrefix($udcChars, "\x1B\$(?", true);
foreach (array_keys($invalidChars) as $invalid) {
$firstByte = ord($invalid[0]);
$firstByte = ord(substr($invalid, 0, 1));
if (($firstByte > 0x80 && $firstByte < 0xA0) || $firstByte >= 0xE0) {
testInvalidString("\x1B\$(?" . $invalid[0], "\x00\x00\x00%", 'ISO-2022-JP-MS', 'UTF-32BE');
} else {
@@ -201,7 +199,15 @@ echo "UDC support OK\n";
// Test "long" illegal character markers
mb_substitute_character("long");
convertInvalidString("\xE0", "BAD+E0", "ISO-2022-JP-MS", "UTF-8");
convertInvalidString("\x1B\$(X", "BAD+242858", "ISO-2022-JP-MS", "UTF-8"); // Invalid escape
// Invalid escapes:
convertInvalidString("\x1B", "BAD+1B", "ISO-2022-JP-MS", "UTF-8");
convertInvalidString("\x1B.", "BAD+1B2E", "ISO-2022-JP-MS", "UTF-8");
convertInvalidString("\x1B\$", "BAD+1B24", "ISO-2022-JP-MS", "UTF-8");
convertInvalidString("\x1B\$.", "BAD+1B242E", "ISO-2022-JP-MS", "UTF-8");
convertInvalidString("\x1B(", "BAD+1B28", "ISO-2022-JP-MS", "UTF-8");
convertInvalidString("\x1B(.", "BAD+1B282E", "ISO-2022-JP-MS", "UTF-8");
convertInvalidString("\x1B\$(", "BAD+1B2428", "ISO-2022-JP-MS", "UTF-8");
convertInvalidString("\x1B\$(X", "BAD+242858", "ISO-2022-JP-MS", "UTF-8");
convertInvalidString("\x1B\$B\x9F", "BAD+9F", "ISO-2022-JP-MS", "UTF-8"); // 0x9F does not start any 2-byte character
echo "Done!\n";