Improve error handling for UTF-16{,BE,LE}

Catch various errors such as the first part of a surrogate pair not being followed by a proper second part, the first part of a surrogate pair appearing at the end of a string, the second part of a surrogate pair appearing out of place, and so on.
2026-04-02 05:32:28 +02:00 · 2020-10-14 20:25:19 +02:00
parent d9ddeb6e85
commit d8895cd054
2 changed files with 77 additions and 94 deletions
--- a/ext/mbstring/libmbfl/filters/mbfilter_utf16.c
+++ b/ext/mbstring/libmbfl/filters/mbfilter_utf16.c
@@ -150,111 +150,89 @@ const struct mbfl_convert_vtbl vtbl_wchar_utf16le = {

 #define CK(statement)	do { if ((statement) < 0) return (-1); } while (0)

-/*
- * UTF-16 => wchar
- */
 int mbfl_filt_conv_utf16_wchar(int c, mbfl_convert_filter *filter)
 {
-	int n, endian;
-
-	endian = filter->status & 0xff00;
-	switch (filter->status & 0x0f) {
-	case 0:
-		if (endian) {
-			n = c & 0xff;
+	/* Start with the assumption that the string is big-endian;
+	 * If we find a little-endian BOM, then we will change that assumption */
+	if (filter->status == 0) {
+		filter->cache = c & 0xFF;
+		filter->status = 1;
+	} else {
+		int n = (filter->cache << 8) | (c & 0xFF);
+		if (n == 0xFFFE) {
+			/* Switch to little-endian mode */
+			filter->filter_function = mbfl_filt_conv_utf16le_wchar;
+			filter->cache = filter->status = 0;
 		} else {
-			n = (c & 0xff) << 8;
-		}
-		filter->cache |= n;
-		filter->status++;
-		break;
-	default:
-		if (endian) {
-			n = (c & 0xff) << 8;
-		} else {
-			n = c & 0xff;
-		}
-		n |= filter->cache & 0xffff;
-		filter->status &= ~0x0f;
-		if (n >= 0xd800 && n < 0xdc00) {
-			filter->cache = ((n & 0x3ff) << 16) + 0x400000;
-		} else if (n >= 0xdc00 && n < 0xe000) {
-			n &= 0x3ff;
-			n |= (filter->cache & 0xfff0000) >> 6;
-			filter->cache = 0;
-			if (n >= MBFL_WCSPLANE_SUPMIN && n < MBFL_WCSPLANE_SUPMAX) {
-				CK((*filter->output_function)(n, filter->data));
-			} else {		/* illegal character */
-				n &= MBFL_WCSGROUP_MASK;
-				n |= MBFL_WCSGROUP_THROUGH;
+			filter->filter_function = mbfl_filt_conv_utf16be_wchar;
+			if (n >= 0xD800 && n <= 0xDBFF) {
+				filter->cache = n & 0x3FF; /* Pick out 10 data bits */
+				filter->status = 2;
+				return c;
+			} else if (n >= 0xDC00 && n <= 0xDFFF) {
+				/* This is wrong; second part of surrogate pair has come first */
+				CK((*filter->output_function)(n | MBFL_WCSGROUP_THROUGH, filter->data));
+			} else if (n != 0xFEFF) {
 				CK((*filter->output_function)(n, filter->data));
 			}
-		} else {
-			int is_first = filter->status & 0x10;
-			filter->cache = 0;
-			filter->status |= 0x10;
-			if (!is_first) {
-				if (n == 0xfffe) {
-					if (endian) {
-						filter->status &= ~0x100;		/* big-endian */
-					} else {
-						filter->status |= 0x100;		/* little-endian */
-					}
-					break;
-				} else if (n == 0xfeff) {
-					break;
-				}
-			}
-			CK((*filter->output_function)(n, filter->data));
+			filter->cache = filter->status = 0;
 		}
-		break;
 	}

 	return c;
 }

-/*
- * UTF-16BE => wchar
- */
 int mbfl_filt_conv_utf16be_wchar(int c, mbfl_convert_filter *filter)
 {
 	int n;

 	switch (filter->status) {
-	case 0:
+	case 0: /* First byte */
+		filter->cache = c & 0xFF;
 		filter->status = 1;
-		n = (c & 0xff) << 8;
-		filter->cache |= n;
 		break;
-	default:
-		filter->status = 0;
-		n = (filter->cache & 0xff00) | (c & 0xff);
-		if (n >= 0xd800 && n < 0xdc00) {
-			filter->cache = ((n & 0x3ff) << 16) + 0x400000;
-		} else if (n >= 0xdc00 && n < 0xe000) {
-			n &= 0x3ff;
-			n |= (filter->cache & 0xfff0000) >> 6;
-			filter->cache = 0;
-			if (n >= MBFL_WCSPLANE_SUPMIN && n < MBFL_WCSPLANE_SUPMAX) {
-				CK((*filter->output_function)(n, filter->data));
-			} else {		/* illegal character */
-				n &= MBFL_WCSGROUP_MASK;
-				n |= MBFL_WCSGROUP_THROUGH;
-				CK((*filter->output_function)(n, filter->data));
-			}
+
+	case 1: /* Second byte */
+		n = (filter->cache << 8) | (c & 0xFF);
+		if (n >= 0xD800 && n <= 0xDBFF) {
+			filter->cache = n & 0x3FF; /* Pick out 10 data bits */
+			filter->status = 2;
+		} else if (n >= 0xDC00 && n <= 0xDFFF) {
+			/* This is wrong; second part of surrogate pair has come first */
+			CK((*filter->output_function)(n | MBFL_WCSGROUP_THROUGH, filter->data));
+			filter->status = 0;
 		} else {
-			filter->cache = 0;
 			CK((*filter->output_function)(n, filter->data));
+			filter->status = 0;
 		}
 		break;
+
+	case 2: /* Second part of surrogate, first byte */
+		filter->cache = (filter->cache << 8) | (c & 0xFF);
+		filter->status = 3;
+		break;
+
+	case 3: /* Second part of surrogate, second byte */
+		n = ((filter->cache & 0xFF) << 8) | (c & 0xFF);
+		if (n >= 0xD800 && n <= 0xDBFF) {
+			/* Wrong; that's the first half of a surrogate pair, not the second */
+			CK((*filter->output_function)((0xD8 << 10) | (filter->cache >> 8) | MBFL_WCSGROUP_THROUGH, filter->data));
+			filter->cache = n & 0x3FF;
+			filter->status = 2;
+		} else if (n >= 0xDC00 && n <= 0xDFFF) {
+			n = ((filter->cache & 0x3FF00) << 2) + (n & 0x3FF) + 0x10000;
+			CK((*filter->output_function)(n, filter->data));
+			filter->status = 0;
+		} else {
+			CK((*filter->output_function)((0xD8 << 10) | (filter->cache >> 8) | MBFL_WCSGROUP_THROUGH, filter->data));
+			CK((*filter->output_function)(n, filter->data));
+			filter->status = 0;
+		}
 	}

 	return c;
 }

-/*
- * wchar => UTF-16BE
- */
 int mbfl_filt_conv_wchar_utf16be(int c, mbfl_convert_filter *filter)
 {
 	int n;
@@ -276,11 +254,10 @@ int mbfl_filt_conv_wchar_utf16be(int c, mbfl_convert_filter *filter)
 	return c;
 }

-/*
- * UTF-16LE => wchar
- */
 int mbfl_filt_conv_utf16le_wchar(int c, mbfl_convert_filter *filter)
 {
+	int n;
+
 	switch (filter->status) {
 	case 0:
 		filter->cache = c & 0xff;
@@ -296,12 +273,12 @@ int mbfl_filt_conv_utf16le_wchar(int c, mbfl_convert_filter *filter)
 			/* This is wrong; the second part of the surrogate pair has come first
 			 * Flag it with `MBFL_WCSGROUP_THROUGH`; the following filter will handle
 			 * the error */
-			int n = (filter->cache + ((c & 0xff) << 8)) | MBFL_WCSGROUP_THROUGH;
-			filter->status = 0;
+			n = (filter->cache + ((c & 0xff) << 8)) | MBFL_WCSGROUP_THROUGH;
 			CK((*filter->output_function)(n, filter->data));
-		} else {
 			filter->status = 0;
+		} else {
 			CK((*filter->output_function)(filter->cache + ((c & 0xff) << 8), filter->data));
+			filter->status = 0;
 		}
 		break;

@@ -311,18 +288,26 @@ int mbfl_filt_conv_utf16le_wchar(int c, mbfl_convert_filter *filter)
 		break;

 	case 3:
-		filter->status = 0;
-		int n = filter->cache + ((c & 0x3) << 8) + 0x10000;
-		CK((*filter->output_function)(n, filter->data));
+		n = (filter->cache & 0xFF) | ((c & 0xFF) << 8);
+		if (n >= 0xD800 && n <= 0xDBFF) {
+			CK((*filter->output_function)((0xD8 << 10) | (filter->cache >> 10) | MBFL_WCSGROUP_THROUGH, filter->data));
+			filter->cache = n & 0x3FF;
+			filter->status = 2;
+		} else if (n >= 0xDC00 && n <= 0xDFFF) {
+			n = filter->cache + ((c & 0x3) << 8) + 0x10000;
+			CK((*filter->output_function)(n, filter->data));
+			filter->status = 0;
+		} else {
+			CK((*filter->output_function)((0xD8 << 10) | (filter->cache >> 10) | MBFL_WCSGROUP_THROUGH, filter->data));
+			CK((*filter->output_function)(n, filter->data));
+			filter->status = 0;
+		}
 		break;
 	}

 	return c;
 }

-/*
- * wchar => UTF-16LE
- */
 int mbfl_filt_conv_wchar_utf16le(int c, mbfl_convert_filter *filter)
 {
 	int n;
@@ -350,7 +335,7 @@ static int mbfl_filt_conv_utf16_wchar_flush(mbfl_convert_filter *filter)
 	int cache = filter->cache;
 	filter->status = filter->cache = 0;

-	if (status & 0xF) {
+	if (status) {
 		/* Input string was truncated */
 		CK((*filter->output_function)(cache | MBFL_WCSGROUP_THROUGH, filter->data));
 	}
--- a/ext/mbstring/libmbfl/mbfl/mbfilter.c
+++ b/ext/mbstring/libmbfl/mbfl/mbfilter.c
@@ -200,7 +200,6 @@ size_t mbfl_buffer_converter_feed(mbfl_buffer_converter *convd, mbfl_string *str
 	size_t n;
 	unsigned char *p;
 	mbfl_convert_filter *filter;
-	int (*filter_function)(int c, mbfl_convert_filter *filter);

 	ZEND_ASSERT(convd);
 	ZEND_ASSERT(string);
@@ -212,9 +211,8 @@ size_t mbfl_buffer_converter_feed(mbfl_buffer_converter *convd, mbfl_string *str

 	filter = convd->filter1;
 	if (filter != NULL) {
-		filter_function = filter->filter_function;
 		while (n > 0) {
-			if ((*filter_function)(*p++, filter) < 0) {
+			if ((*filter->filter_function)(*p++, filter) < 0) {
 				return p - string->val;
 			}
 			n--;