1
0
mirror of https://github.com/php/php-src.git synced 2026-04-26 01:18:19 +02:00

Fix error reporting bug for Unicode -> CP50220 conversion

To detect errors in conversion from Unicode to another text encoding, each
mbstring conversion filter object maintains a count of 'bad' characters. After
a conversion operation finishes, this count is checked to see if there was any
error.

The problem with CP50220 was that mbstring used a chain of two conversion filter
objects. The 'bad character count' would be incremented on the second object in
the chain, but this didn't do anything, as only the count on the first such
object is ever checked.

Fix this by implementing the conversion using a single conversion filter object,
rather than a chain of two. This is possible because of the recent refactoring,
which pulled out the needed logic for CP50220 conversion into a helper function.
This commit is contained in:
Alex Dowad
2021-01-20 15:52:40 +02:00
parent 1f130d4e58
commit ebe6500a0b
3 changed files with 45 additions and 29 deletions
+38 -28
View File
@@ -31,9 +31,9 @@
#include "unicode_table_jis.h"
#include "cp932_table.h"
static void mbfl_filt_conv_wchar_cp50220_ctor(mbfl_convert_filter *filt);
static void mbfl_filt_conv_wchar_cp50220_dtor(mbfl_convert_filter *filt);
static int mbfl_filt_conv_cp5022x_wchar_flush(mbfl_convert_filter *filter);
static int mbfl_filt_conv_wchar_cp50220_flush(mbfl_convert_filter *filter);
static int mbfl_filt_conv_wchar_cp50220(int c, mbfl_convert_filter *filter);
/* Previously, a dubious 'encoding' called 'cp50220raw' was supported
* This was just CP50220, but the implementation was less strict regarding
@@ -92,10 +92,10 @@ const struct mbfl_convert_vtbl vtbl_cp50220_wchar = {
/* Conversion table for wchar => CP50220.
 *
 * The conversion is performed by a single filter object: the common
 * constructor is used, no destructor is needed, and the CP50220-specific
 * filter and flush functions do the kana folding themselves before handing
 * codepoints on to the CP50221 converter. */
const struct mbfl_convert_vtbl vtbl_wchar_cp50220 = {
	mbfl_no_encoding_wchar,
	mbfl_no_encoding_cp50220,
	mbfl_filt_conv_common_ctor,
	NULL,
	mbfl_filt_conv_wchar_cp50220,
	mbfl_filt_conv_wchar_cp50220_flush,
	NULL,
};
@@ -318,35 +318,45 @@ static int mbfl_filt_conv_cp5022x_wchar_flush(mbfl_convert_filter *filter)
return 0;
}
/*
* wchar => CP50220
*/
static void mbfl_filt_conv_wchar_cp50220_ctor(mbfl_convert_filter *filt)
/*
 * Filter function for wchar => CP50220.
 *
 * One codepoint is held back in filter->cache so that it can be passed to
 * mbfl_convert_kana() together with the codepoint which follows it. The
 * result (and a second output codepoint, if one is produced) is then fed to
 * mbfl_filt_conv_wchar_cp50221(), which runs on this same filter object.
 *
 * c:      the incoming codepoint
 * filter: the conversion filter; filter->cache holds the pending codepoint,
 *         with 0 meaning that nothing is pending
 *
 * Always returns 0.
 */
static int mbfl_filt_conv_wchar_cp50220(int c, mbfl_convert_filter *filter)
{
	int mode = MBFL_FILT_TL_HAN2ZEN_KATAKANA | MBFL_FILT_TL_HAN2ZEN_GLUE;
	int second = 0;
	bool consumed = false;

	if (filter->cache) {
		int s = mbfl_convert_kana(filter->cache, c, &consumed, &second, mode);
		/* If `c` was not consumed along with the cached codepoint, it becomes
		 * the new pending codepoint */
		filter->cache = consumed ? 0 : c;
		mbfl_filt_conv_wchar_cp50221(s, filter);
		if (second) {
			mbfl_filt_conv_wchar_cp50221(second, filter);
		}
		if (c == 0 && !consumed) {
			/* Storing 0 in the cache is indistinguishable from an empty
			 * cache, so a NUL which was not consumed must be emitted right
			 * away or it would be lost */
			(*filter->output_function)(0, filter->data);
		}
	} else if (c == 0) {
		/* This case has to be handled separately, since `filter->cache == 0` means
		 * no codepoint is cached */
		(*filter->output_function)(0, filter->data);
	} else {
		filter->cache = c;
	}

	return 0;
}
static void mbfl_filt_conv_wchar_cp50220_dtor(mbfl_convert_filter *filt)
/*
 * Flush function for wchar => CP50220.
 *
 * If a codepoint is still pending in filter->cache, it is run through
 * mbfl_convert_kana() with 0 as the following codepoint, and every codepoint
 * which that produces is fed to mbfl_filt_conv_wchar_cp50221(). The cache is
 * then cleared.
 *
 * Returns whatever mbfl_filt_conv_any_jis_flush() returns for this filter.
 */
static int mbfl_filt_conv_wchar_cp50220_flush(mbfl_convert_filter *filter)
{
	int mode = MBFL_FILT_TL_HAN2ZEN_KATAKANA | MBFL_FILT_TL_HAN2ZEN_GLUE;
	int second = 0;

	if (filter->cache) {
		int s = mbfl_convert_kana(filter->cache, 0, NULL, &second, mode);
		mbfl_filt_conv_wchar_cp50221(s, filter);
		if (second) {
			/* Emit the second output codepoint; passing `s` here would
			 * write the first one twice and drop this one */
			mbfl_filt_conv_wchar_cp50221(second, filter);
		}
		filter->cache = 0;
	}

	return mbfl_filt_conv_any_jis_flush(filter);
}
/*
* wchar => CP50221
*/
int mbfl_filt_conv_wchar_cp50221(int c, mbfl_convert_filter *filter)
{
int s = 0;
@@ -44,7 +44,6 @@ extern const struct mbfl_convert_vtbl vtbl_cp50222_wchar;
extern const struct mbfl_convert_vtbl vtbl_wchar_cp50222;
int mbfl_filt_conv_cp5022x_wchar(int c, mbfl_convert_filter *filter);
int mbfl_filt_conv_wchar_cp50220(int c, mbfl_convert_filter *filter);
int mbfl_filt_conv_wchar_cp50221(int c, mbfl_convert_filter *filter);
int mbfl_filt_conv_wchar_cp50222(int c, mbfl_convert_filter *filter);
int mbfl_filt_conv_wchar_cp50222_flush(mbfl_convert_filter *filter);
+7
View File
@@ -285,9 +285,16 @@ foreach ($fullwidthKatakana as $cp => $kuten) {
echo "Folding of fullwidth katakana for CP50220 OK\n";
testInvalidString("\xD8\x00", '%', 'UTF-16BE', 'CP50220');
testInvalidString("\xD8\x00", '%', 'UTF-16BE', 'CP50221');
testInvalidString("\xD8\x00", '%', 'UTF-16BE', 'CP50222');
echo "Invalid Unicode is flagged when converting to CP5022x\n";
?>
--EXPECT--
ASCII support OK
JIS X 0201 support OK
CP932 support OK
Folding of fullwidth katakana for CP50220 OK
Invalid Unicode is flagged when converting to CP5022x