mirror of
https://github.com/php/php-src.git
synced 2026-04-02 05:32:28 +02:00
Improve error handling for UTF-16{,BE,LE}
Catch various errors such as the first part of a surrogate pair not being followed by a proper second part, the first part of a surrogate pair appearing at the end of a string, the second part of a surrogate pair appearing out of place, and so on.
This commit is contained in:
@@ -150,111 +150,89 @@ const struct mbfl_convert_vtbl vtbl_wchar_utf16le = {
|
||||
|
||||
#define CK(statement) do { if ((statement) < 0) return (-1); } while (0)
|
||||
|
||||
/*
|
||||
* UTF-16 => wchar
|
||||
*/
|
||||
int mbfl_filt_conv_utf16_wchar(int c, mbfl_convert_filter *filter)
|
||||
{
|
||||
int n, endian;
|
||||
|
||||
endian = filter->status & 0xff00;
|
||||
switch (filter->status & 0x0f) {
|
||||
case 0:
|
||||
if (endian) {
|
||||
n = c & 0xff;
|
||||
/* Start with the assumption that the string is big-endian;
|
||||
* If we find a little-endian BOM, then we will change that assumption */
|
||||
if (filter->status == 0) {
|
||||
filter->cache = c & 0xFF;
|
||||
filter->status = 1;
|
||||
} else {
|
||||
int n = (filter->cache << 8) | (c & 0xFF);
|
||||
if (n == 0xFFFE) {
|
||||
/* Switch to little-endian mode */
|
||||
filter->filter_function = mbfl_filt_conv_utf16le_wchar;
|
||||
filter->cache = filter->status = 0;
|
||||
} else {
|
||||
n = (c & 0xff) << 8;
|
||||
}
|
||||
filter->cache |= n;
|
||||
filter->status++;
|
||||
break;
|
||||
default:
|
||||
if (endian) {
|
||||
n = (c & 0xff) << 8;
|
||||
} else {
|
||||
n = c & 0xff;
|
||||
}
|
||||
n |= filter->cache & 0xffff;
|
||||
filter->status &= ~0x0f;
|
||||
if (n >= 0xd800 && n < 0xdc00) {
|
||||
filter->cache = ((n & 0x3ff) << 16) + 0x400000;
|
||||
} else if (n >= 0xdc00 && n < 0xe000) {
|
||||
n &= 0x3ff;
|
||||
n |= (filter->cache & 0xfff0000) >> 6;
|
||||
filter->cache = 0;
|
||||
if (n >= MBFL_WCSPLANE_SUPMIN && n < MBFL_WCSPLANE_SUPMAX) {
|
||||
CK((*filter->output_function)(n, filter->data));
|
||||
} else { /* illegal character */
|
||||
n &= MBFL_WCSGROUP_MASK;
|
||||
n |= MBFL_WCSGROUP_THROUGH;
|
||||
filter->filter_function = mbfl_filt_conv_utf16be_wchar;
|
||||
if (n >= 0xD800 && n <= 0xDBFF) {
|
||||
filter->cache = n & 0x3FF; /* Pick out 10 data bits */
|
||||
filter->status = 2;
|
||||
return c;
|
||||
} else if (n >= 0xDC00 && n <= 0xDFFF) {
|
||||
/* This is wrong; second part of surrogate pair has come first */
|
||||
CK((*filter->output_function)(n | MBFL_WCSGROUP_THROUGH, filter->data));
|
||||
} else if (n != 0xFEFF) {
|
||||
CK((*filter->output_function)(n, filter->data));
|
||||
}
|
||||
} else {
|
||||
int is_first = filter->status & 0x10;
|
||||
filter->cache = 0;
|
||||
filter->status |= 0x10;
|
||||
if (!is_first) {
|
||||
if (n == 0xfffe) {
|
||||
if (endian) {
|
||||
filter->status &= ~0x100; /* big-endian */
|
||||
} else {
|
||||
filter->status |= 0x100; /* little-endian */
|
||||
}
|
||||
break;
|
||||
} else if (n == 0xfeff) {
|
||||
break;
|
||||
}
|
||||
}
|
||||
CK((*filter->output_function)(n, filter->data));
|
||||
filter->cache = filter->status = 0;
|
||||
}
|
||||
break;
|
||||
}
|
||||
|
||||
return c;
|
||||
}
|
||||
|
||||
/*
|
||||
* UTF-16BE => wchar
|
||||
*/
|
||||
int mbfl_filt_conv_utf16be_wchar(int c, mbfl_convert_filter *filter)
|
||||
{
|
||||
int n;
|
||||
|
||||
switch (filter->status) {
|
||||
case 0:
|
||||
case 0: /* First byte */
|
||||
filter->cache = c & 0xFF;
|
||||
filter->status = 1;
|
||||
n = (c & 0xff) << 8;
|
||||
filter->cache |= n;
|
||||
break;
|
||||
default:
|
||||
filter->status = 0;
|
||||
n = (filter->cache & 0xff00) | (c & 0xff);
|
||||
if (n >= 0xd800 && n < 0xdc00) {
|
||||
filter->cache = ((n & 0x3ff) << 16) + 0x400000;
|
||||
} else if (n >= 0xdc00 && n < 0xe000) {
|
||||
n &= 0x3ff;
|
||||
n |= (filter->cache & 0xfff0000) >> 6;
|
||||
filter->cache = 0;
|
||||
if (n >= MBFL_WCSPLANE_SUPMIN && n < MBFL_WCSPLANE_SUPMAX) {
|
||||
CK((*filter->output_function)(n, filter->data));
|
||||
} else { /* illegal character */
|
||||
n &= MBFL_WCSGROUP_MASK;
|
||||
n |= MBFL_WCSGROUP_THROUGH;
|
||||
CK((*filter->output_function)(n, filter->data));
|
||||
}
|
||||
|
||||
case 1: /* Second byte */
|
||||
n = (filter->cache << 8) | (c & 0xFF);
|
||||
if (n >= 0xD800 && n <= 0xDBFF) {
|
||||
filter->cache = n & 0x3FF; /* Pick out 10 data bits */
|
||||
filter->status = 2;
|
||||
} else if (n >= 0xDC00 && n <= 0xDFFF) {
|
||||
/* This is wrong; second part of surrogate pair has come first */
|
||||
CK((*filter->output_function)(n | MBFL_WCSGROUP_THROUGH, filter->data));
|
||||
filter->status = 0;
|
||||
} else {
|
||||
filter->cache = 0;
|
||||
CK((*filter->output_function)(n, filter->data));
|
||||
filter->status = 0;
|
||||
}
|
||||
break;
|
||||
|
||||
case 2: /* Second part of surrogate, first byte */
|
||||
filter->cache = (filter->cache << 8) | (c & 0xFF);
|
||||
filter->status = 3;
|
||||
break;
|
||||
|
||||
case 3: /* Second part of surrogate, second byte */
|
||||
n = ((filter->cache & 0xFF) << 8) | (c & 0xFF);
|
||||
if (n >= 0xD800 && n <= 0xDBFF) {
|
||||
/* Wrong; that's the first half of a surrogate pair, not the second */
|
||||
CK((*filter->output_function)((0xD8 << 10) | (filter->cache >> 8) | MBFL_WCSGROUP_THROUGH, filter->data));
|
||||
filter->cache = n & 0x3FF;
|
||||
filter->status = 2;
|
||||
} else if (n >= 0xDC00 && n <= 0xDFFF) {
|
||||
n = ((filter->cache & 0x3FF00) << 2) + (n & 0x3FF) + 0x10000;
|
||||
CK((*filter->output_function)(n, filter->data));
|
||||
filter->status = 0;
|
||||
} else {
|
||||
CK((*filter->output_function)((0xD8 << 10) | (filter->cache >> 8) | MBFL_WCSGROUP_THROUGH, filter->data));
|
||||
CK((*filter->output_function)(n, filter->data));
|
||||
filter->status = 0;
|
||||
}
|
||||
}
|
||||
|
||||
return c;
|
||||
}
|
||||
|
||||
/*
|
||||
* wchar => UTF-16BE
|
||||
*/
|
||||
int mbfl_filt_conv_wchar_utf16be(int c, mbfl_convert_filter *filter)
|
||||
{
|
||||
int n;
|
||||
@@ -276,11 +254,10 @@ int mbfl_filt_conv_wchar_utf16be(int c, mbfl_convert_filter *filter)
|
||||
return c;
|
||||
}
|
||||
|
||||
/*
|
||||
* UTF-16LE => wchar
|
||||
*/
|
||||
int mbfl_filt_conv_utf16le_wchar(int c, mbfl_convert_filter *filter)
|
||||
{
|
||||
int n;
|
||||
|
||||
switch (filter->status) {
|
||||
case 0:
|
||||
filter->cache = c & 0xff;
|
||||
@@ -296,12 +273,12 @@ int mbfl_filt_conv_utf16le_wchar(int c, mbfl_convert_filter *filter)
|
||||
/* This is wrong; the second part of the surrogate pair has come first
|
||||
* Flag it with `MBFL_WCSGROUP_THROUGH`; the following filter will handle
|
||||
* the error */
|
||||
int n = (filter->cache + ((c & 0xff) << 8)) | MBFL_WCSGROUP_THROUGH;
|
||||
filter->status = 0;
|
||||
n = (filter->cache + ((c & 0xff) << 8)) | MBFL_WCSGROUP_THROUGH;
|
||||
CK((*filter->output_function)(n, filter->data));
|
||||
} else {
|
||||
filter->status = 0;
|
||||
} else {
|
||||
CK((*filter->output_function)(filter->cache + ((c & 0xff) << 8), filter->data));
|
||||
filter->status = 0;
|
||||
}
|
||||
break;
|
||||
|
||||
@@ -311,18 +288,26 @@ int mbfl_filt_conv_utf16le_wchar(int c, mbfl_convert_filter *filter)
|
||||
break;
|
||||
|
||||
case 3:
|
||||
filter->status = 0;
|
||||
int n = filter->cache + ((c & 0x3) << 8) + 0x10000;
|
||||
CK((*filter->output_function)(n, filter->data));
|
||||
n = (filter->cache & 0xFF) | ((c & 0xFF) << 8);
|
||||
if (n >= 0xD800 && n <= 0xDBFF) {
|
||||
CK((*filter->output_function)((0xD8 << 10) | (filter->cache >> 10) | MBFL_WCSGROUP_THROUGH, filter->data));
|
||||
filter->cache = n & 0x3FF;
|
||||
filter->status = 2;
|
||||
} else if (n >= 0xDC00 && n <= 0xDFFF) {
|
||||
n = filter->cache + ((c & 0x3) << 8) + 0x10000;
|
||||
CK((*filter->output_function)(n, filter->data));
|
||||
filter->status = 0;
|
||||
} else {
|
||||
CK((*filter->output_function)((0xD8 << 10) | (filter->cache >> 10) | MBFL_WCSGROUP_THROUGH, filter->data));
|
||||
CK((*filter->output_function)(n, filter->data));
|
||||
filter->status = 0;
|
||||
}
|
||||
break;
|
||||
}
|
||||
|
||||
return c;
|
||||
}
|
||||
|
||||
/*
|
||||
* wchar => UTF-16LE
|
||||
*/
|
||||
int mbfl_filt_conv_wchar_utf16le(int c, mbfl_convert_filter *filter)
|
||||
{
|
||||
int n;
|
||||
@@ -350,7 +335,7 @@ static int mbfl_filt_conv_utf16_wchar_flush(mbfl_convert_filter *filter)
|
||||
int cache = filter->cache;
|
||||
filter->status = filter->cache = 0;
|
||||
|
||||
if (status & 0xF) {
|
||||
if (status) {
|
||||
/* Input string was truncated */
|
||||
CK((*filter->output_function)(cache | MBFL_WCSGROUP_THROUGH, filter->data));
|
||||
}
|
||||
|
||||
@@ -200,7 +200,6 @@ size_t mbfl_buffer_converter_feed(mbfl_buffer_converter *convd, mbfl_string *str
|
||||
size_t n;
|
||||
unsigned char *p;
|
||||
mbfl_convert_filter *filter;
|
||||
int (*filter_function)(int c, mbfl_convert_filter *filter);
|
||||
|
||||
ZEND_ASSERT(convd);
|
||||
ZEND_ASSERT(string);
|
||||
@@ -212,9 +211,8 @@ size_t mbfl_buffer_converter_feed(mbfl_buffer_converter *convd, mbfl_string *str
|
||||
|
||||
filter = convd->filter1;
|
||||
if (filter != NULL) {
|
||||
filter_function = filter->filter_function;
|
||||
while (n > 0) {
|
||||
if ((*filter_function)(*p++, filter) < 0) {
|
||||
if ((*filter->filter_function)(*p++, filter) < 0) {
|
||||
return p - string->val;
|
||||
}
|
||||
n--;
|
||||
|
||||
Reference in New Issue
Block a user