1
0
mirror of https://github.com/php/php-src.git synced 2026-04-02 05:32:28 +02:00

Improve error handling for UTF-16{,BE,LE}

Catch various errors such as the first part of a surrogate pair not being
followed by a proper second part, the first part of a surrogate pair appearing
at the end of a string, the second part of a surrogate pair appearing out
of place, and so on.
This commit is contained in:
Alex Dowad
2020-10-14 20:25:19 +02:00
parent d9ddeb6e85
commit d8895cd054
2 changed files with 77 additions and 94 deletions

View File

@@ -150,111 +150,89 @@ const struct mbfl_convert_vtbl vtbl_wchar_utf16le = {
#define CK(statement) do { if ((statement) < 0) return (-1); } while (0)
/*
* UTF-16 => wchar
*
* Consumes one input byte `c` per call, keeping its state between calls in
* filter->status and filter->cache. Returns `c`, or -1 (through CK) when the
* output function reports a negative result.
*
* NOTE(review): this listing is a rendered diff hunk. Lines from before and
* after the change are interleaved without +/- markers, so the body below does
* not read as a single compilable function; separate the two versions before
* editing any logic here.
*/
int mbfl_filt_conv_utf16_wchar(int c, mbfl_convert_filter *filter)
{
int n, endian;
endian = filter->status & 0xff00;
switch (filter->status & 0x0f) {
case 0:
if (endian) {
n = c & 0xff;
/* Start with the assumption that the string is big-endian;
* If we find a little-endian BOM, then we will change that assumption */
/* No byte is pending: remember this one and wait for its partner. */
if (filter->status == 0) {
filter->cache = c & 0xFF;
filter->status = 1;
} else {
/* Remembered byte becomes the high byte, this byte the low byte. */
int n = (filter->cache << 8) | (c & 0xFF);
if (n == 0xFFFE) {
/* Switch to little-endian mode */
/* Later bytes are routed to the UTF-16LE filter; state is reset. */
filter->filter_function = mbfl_filt_conv_utf16le_wchar;
filter->cache = filter->status = 0;
} else {
n = (c & 0xff) << 8;
}
filter->cache |= n;
filter->status++;
break;
default:
if (endian) {
n = (c & 0xff) << 8;
} else {
n = c & 0xff;
}
n |= filter->cache & 0xffff;
filter->status &= ~0x0f;
if (n >= 0xd800 && n < 0xdc00) {
filter->cache = ((n & 0x3ff) << 16) + 0x400000;
} else if (n >= 0xdc00 && n < 0xe000) {
n &= 0x3ff;
n |= (filter->cache & 0xfff0000) >> 6;
filter->cache = 0;
if (n >= MBFL_WCSPLANE_SUPMIN && n < MBFL_WCSPLANE_SUPMAX) {
CK((*filter->output_function)(n, filter->data));
} else { /* illegal character */
n &= MBFL_WCSGROUP_MASK;
n |= MBFL_WCSGROUP_THROUGH;
/* Later bytes are routed to the UTF-16BE filter. */
filter->filter_function = mbfl_filt_conv_utf16be_wchar;
if (n >= 0xD800 && n <= 0xDBFF) {
filter->cache = n & 0x3FF; /* Pick out 10 data bits */
/* Status 2 with the 10 bits cached; the early return skips the
* state reset further down so that this state is kept. */
filter->status = 2;
return c;
} else if (n >= 0xDC00 && n <= 0xDFFF) {
/* This is wrong; second part of surrogate pair has come first */
CK((*filter->output_function)(n | MBFL_WCSGROUP_THROUGH, filter->data));
/* Any other unit is output unchanged, except 0xFEFF, which is dropped
* (presumably because it is the byte-order mark -- confirm). */
} else if (n != 0xFEFF) {
CK((*filter->output_function)(n, filter->data));
}
} else {
int is_first = filter->status & 0x10;
filter->cache = 0;
filter->status |= 0x10;
if (!is_first) {
if (n == 0xfffe) {
if (endian) {
filter->status &= ~0x100; /* big-endian */
} else {
filter->status |= 0x100; /* little-endian */
}
break;
} else if (n == 0xfeff) {
break;
}
}
CK((*filter->output_function)(n, filter->data));
filter->cache = filter->status = 0;
}
break;
}
return c;
}
/*
* UTF-16BE => wchar
*
* Consumes one input byte `c` per call and drives a state machine keyed on
* filter->status: 0 = first byte of a unit, 1 = second byte of a unit,
* 2 and 3 = first and second byte of the unit following a lead surrogate.
* Returns `c`, or -1 (through CK) when the output function reports a negative
* result.
*
* NOTE(review): this listing is a rendered diff hunk. Lines from before and
* after the change are interleaved without +/- markers (for example the two
* `case 0:` labels and the `default:` branch next to `case 1:`), so the body
* does not read as a single compilable function; separate the two versions
* before editing any logic here.
*/
int mbfl_filt_conv_utf16be_wchar(int c, mbfl_convert_filter *filter)
{
int n;
switch (filter->status) {
case 0:
case 0: /* First byte */
filter->cache = c & 0xFF;
filter->status = 1;
n = (c & 0xff) << 8;
filter->cache |= n;
break;
default:
filter->status = 0;
n = (filter->cache & 0xff00) | (c & 0xff);
if (n >= 0xd800 && n < 0xdc00) {
filter->cache = ((n & 0x3ff) << 16) + 0x400000;
} else if (n >= 0xdc00 && n < 0xe000) {
n &= 0x3ff;
n |= (filter->cache & 0xfff0000) >> 6;
filter->cache = 0;
if (n >= MBFL_WCSPLANE_SUPMIN && n < MBFL_WCSPLANE_SUPMAX) {
CK((*filter->output_function)(n, filter->data));
} else { /* illegal character */
n &= MBFL_WCSGROUP_MASK;
n |= MBFL_WCSGROUP_THROUGH;
CK((*filter->output_function)(n, filter->data));
}
case 1: /* Second byte */
/* Cached byte is the high byte, this byte the low byte. */
n = (filter->cache << 8) | (c & 0xFF);
if (n >= 0xD800 && n <= 0xDBFF) {
filter->cache = n & 0x3FF; /* Pick out 10 data bits */
filter->status = 2;
} else if (n >= 0xDC00 && n <= 0xDFFF) {
/* This is wrong; second part of surrogate pair has come first */
CK((*filter->output_function)(n | MBFL_WCSGROUP_THROUGH, filter->data));
filter->status = 0;
} else {
filter->cache = 0;
CK((*filter->output_function)(n, filter->data));
filter->status = 0;
}
break;
case 2: /* Second part of surrogate, first byte */
/* Cache now holds the 10 saved bits in bits 8-17 and this byte in bits 0-7. */
filter->cache = (filter->cache << 8) | (c & 0xFF);
filter->status = 3;
break;
case 3: /* Second part of surrogate, second byte */
n = ((filter->cache & 0xFF) << 8) | (c & 0xFF);
if (n >= 0xD800 && n <= 0xDBFF) {
/* Wrong; that's the first half of a surrogate pair, not the second */
/* NOTE(review): (0xD8 << 10) is 0x36000, while the range tested above starts
* at 0xD800 (0xD8 << 8). The value reported here therefore does not look
* like the original 16-bit unit (0xD800 | saved bits) -- confirm intent. */
CK((*filter->output_function)((0xD8 << 10) | (filter->cache >> 8) | MBFL_WCSGROUP_THROUGH, filter->data));
/* The new lead surrogate replaces the old one; wait for its partner. */
filter->cache = n & 0x3FF;
filter->status = 2;
} else if (n >= 0xDC00 && n <= 0xDFFF) {
/* Saved 10 bits move to bits 10-19, the low 10 bits of this unit fill
* bits 0-9, and 0x10000 is added on top. */
n = ((filter->cache & 0x3FF00) << 2) + (n & 0x3FF) + 0x10000;
CK((*filter->output_function)(n, filter->data));
filter->status = 0;
} else {
/* Unpaired lead surrogate followed by an ordinary unit: report the
* former, then output the latter.
* NOTE(review): same (0xD8 << 10) question as in the first branch. */
CK((*filter->output_function)((0xD8 << 10) | (filter->cache >> 8) | MBFL_WCSGROUP_THROUGH, filter->data));
CK((*filter->output_function)(n, filter->data));
filter->status = 0;
}
}
return c;
}
/*
* wchar => UTF-16BE
*/
int mbfl_filt_conv_wchar_utf16be(int c, mbfl_convert_filter *filter)
{
int n;
@@ -276,11 +254,10 @@ int mbfl_filt_conv_wchar_utf16be(int c, mbfl_convert_filter *filter)
return c;
}
/*
* UTF-16LE => wchar
*/
int mbfl_filt_conv_utf16le_wchar(int c, mbfl_convert_filter *filter)
{
int n;
switch (filter->status) {
case 0:
filter->cache = c & 0xff;
@@ -296,12 +273,12 @@ int mbfl_filt_conv_utf16le_wchar(int c, mbfl_convert_filter *filter)
/* This is wrong; the second part of the surrogate pair has come first
* Flag it with `MBFL_WCSGROUP_THROUGH`; the following filter will handle
* the error */
int n = (filter->cache + ((c & 0xff) << 8)) | MBFL_WCSGROUP_THROUGH;
filter->status = 0;
n = (filter->cache + ((c & 0xff) << 8)) | MBFL_WCSGROUP_THROUGH;
CK((*filter->output_function)(n, filter->data));
} else {
filter->status = 0;
} else {
CK((*filter->output_function)(filter->cache + ((c & 0xff) << 8), filter->data));
filter->status = 0;
}
break;
@@ -311,18 +288,26 @@ int mbfl_filt_conv_utf16le_wchar(int c, mbfl_convert_filter *filter)
break;
case 3:
filter->status = 0;
int n = filter->cache + ((c & 0x3) << 8) + 0x10000;
CK((*filter->output_function)(n, filter->data));
n = (filter->cache & 0xFF) | ((c & 0xFF) << 8);
if (n >= 0xD800 && n <= 0xDBFF) {
CK((*filter->output_function)((0xD8 << 10) | (filter->cache >> 10) | MBFL_WCSGROUP_THROUGH, filter->data));
filter->cache = n & 0x3FF;
filter->status = 2;
} else if (n >= 0xDC00 && n <= 0xDFFF) {
n = filter->cache + ((c & 0x3) << 8) + 0x10000;
CK((*filter->output_function)(n, filter->data));
filter->status = 0;
} else {
CK((*filter->output_function)((0xD8 << 10) | (filter->cache >> 10) | MBFL_WCSGROUP_THROUGH, filter->data));
CK((*filter->output_function)(n, filter->data));
filter->status = 0;
}
break;
}
return c;
}
/*
* wchar => UTF-16LE
*/
int mbfl_filt_conv_wchar_utf16le(int c, mbfl_convert_filter *filter)
{
int n;
@@ -350,7 +335,7 @@ static int mbfl_filt_conv_utf16_wchar_flush(mbfl_convert_filter *filter)
int cache = filter->cache;
filter->status = filter->cache = 0;
if (status & 0xF) {
if (status) {
/* Input string was truncated */
CK((*filter->output_function)(cache | MBFL_WCSGROUP_THROUGH, filter->data));
}

View File

@@ -200,7 +200,6 @@ size_t mbfl_buffer_converter_feed(mbfl_buffer_converter *convd, mbfl_string *str
size_t n;
unsigned char *p;
mbfl_convert_filter *filter;
int (*filter_function)(int c, mbfl_convert_filter *filter);
ZEND_ASSERT(convd);
ZEND_ASSERT(string);
@@ -212,9 +211,8 @@ size_t mbfl_buffer_converter_feed(mbfl_buffer_converter *convd, mbfl_string *str
filter = convd->filter1;
if (filter != NULL) {
filter_function = filter->filter_function;
while (n > 0) {
if ((*filter_function)(*p++, filter) < 0) {
if ((*filter->filter_function)(*p++, filter) < 0) {
return p - string->val;
}
n--;