Add identify filter for UTF-32{,BE,LE}

2026-04-25 00:48:25 +02:00 · 2020-09-06 14:42:55 +02:00
parent d8895cd054
commit 7047e5d2c4
3 changed files with 153 additions and 0 deletions
@@ -30,6 +30,10 @@
 #include "mbfilter.h"
 #include "mbfilter_utf32.h"

+/* Per-byte identify filter functions for the UTF-32 variants; they are
+ * declared here because the identify vtables refer to them before their
+ * definitions appear further down in this file. */
+static int mbfl_filt_ident_utf32(int c, mbfl_identify_filter *filter);
+static int mbfl_filt_ident_utf32le(int c, mbfl_identify_filter *filter);
+static int mbfl_filt_ident_utf32be(int c, mbfl_identify_filter *filter);
+
 static const char *mbfl_encoding_utf32_aliases[] = {"utf32", NULL};

 const mbfl_encoding mbfl_encoding_utf32 = {
@@ -65,6 +69,24 @@ const mbfl_encoding mbfl_encoding_utf32le = {
 	&vtbl_wchar_utf32le
 };

+/* Identify vtable for UTF-32 without an explicit byte order: ties
+ * mbfl_no_encoding_utf32 to the shared mbfl_filt_ident_common_ctor and the
+ * BOM-sniffing filter mbfl_filt_ident_utf32.
+ * NOTE(review): initializer is positional; field order is presumably
+ * (encoding id, constructor, filter function) -- confirm against the
+ * declaration of struct mbfl_identify_vtbl. */
+const struct mbfl_identify_vtbl vtbl_identify_utf32 = {
+	mbfl_no_encoding_utf32,
+	mbfl_filt_ident_common_ctor,
+	mbfl_filt_ident_utf32
+};
+
+/* Identify vtable for big-endian UTF-32: ties mbfl_no_encoding_utf32be to the
+ * shared mbfl_filt_ident_common_ctor and the filter mbfl_filt_ident_utf32be.
+ * NOTE(review): initializer is positional; field order is presumably
+ * (encoding id, constructor, filter function) -- confirm against the
+ * declaration of struct mbfl_identify_vtbl. */
+const struct mbfl_identify_vtbl vtbl_identify_utf32be = {
+	mbfl_no_encoding_utf32be,
+	mbfl_filt_ident_common_ctor,
+	mbfl_filt_ident_utf32be
+};
+
+/* Identify vtable for little-endian UTF-32: ties mbfl_no_encoding_utf32le to
+ * the shared mbfl_filt_ident_common_ctor and the filter
+ * mbfl_filt_ident_utf32le.
+ * NOTE(review): initializer is positional; field order is presumably
+ * (encoding id, constructor, filter function) -- confirm against the
+ * declaration of struct mbfl_identify_vtbl. */
+const struct mbfl_identify_vtbl vtbl_identify_utf32le = {
+	mbfl_no_encoding_utf32le,
+	mbfl_filt_ident_common_ctor,
+	mbfl_filt_ident_utf32le
+};
+
 const struct mbfl_convert_vtbl vtbl_utf32_wchar = {
 	mbfl_no_encoding_utf32,
 	mbfl_no_encoding_wchar,
@@ -289,3 +311,128 @@ int mbfl_filt_conv_wchar_utf32le(int c, mbfl_convert_filter *filter)

 	return c;
 }
+
+/* Identify filter for UTF-32 with no explicit byte order.
+ *
+ * Called once per input byte `c`. While filter->status is 0..3 it counts how
+ * many bytes of the little-endian byte-order mark (FF FE 00 00) have matched
+ * so far. If the whole mark is seen, the filter installs
+ * mbfl_filt_ident_utf32le as its filter_function for all following bytes.
+ * On the first mismatch it installs mbfl_filt_ident_utf32be instead, replays
+ * the mark bytes it had already swallowed, and then forwards `c`.
+ *
+ * The largest valid codepoint is 0x10FFFF; values above that and surrogates
+ * are rejected by the LE/BE filters this function hands over to.
+ *
+ * Returns `c`, either directly or via the filter that was handed over to. */
+static int mbfl_filt_ident_utf32(int c, mbfl_identify_filter *filter)
+{
+	switch (filter->status) {
+	case 0: /* 1st byte */
+		if (c == 0xff) {
+			filter->status = 1;
+			return c;
+		}
+		/* Nothing swallowed yet; status is already 0 for the BE filter */
+		filter->filter_function = mbfl_filt_ident_utf32be;
+		break;
+
+	case 1: /* 2nd byte */
+		if (c == 0xfe) {
+			filter->status = 2;
+			return c;
+		}
+		/* The BE filter uses `status` as its own byte position, so it must
+		 * restart from 0 before the swallowed bytes are replayed; otherwise
+		 * they would be interpreted at the wrong position in the 4-byte unit */
+		filter->status = 0;
+		filter->filter_function = mbfl_filt_ident_utf32be;
+		(filter->filter_function)(0xff, filter);
+		break;
+
+	case 2: /* 3rd byte */
+		if (c == 0) {
+			filter->status = 3;
+			return c;
+		}
+		filter->status = 0; /* restart byte position for the BE filter */
+		filter->filter_function = mbfl_filt_ident_utf32be;
+		(filter->filter_function)(0xff, filter);
+		(filter->filter_function)(0xfe, filter);
+		break;
+
+	case 3: /* 4th byte */
+		if (c == 0) {
+			/* We found a little-endian byte-order mark! */
+			filter->status = 0;
+			filter->filter_function = mbfl_filt_ident_utf32le;
+			return c;
+		}
+		filter->status = 0; /* restart byte position for the BE filter */
+		filter->filter_function = mbfl_filt_ident_utf32be;
+		(filter->filter_function)(0xff, filter);
+		(filter->filter_function)(0xfe, filter);
+		(filter->filter_function)(0, filter);
+		break;
+	}
+
+	/* Hand the current byte to the big-endian filter installed above */
+	return (filter->filter_function)(c, filter);
+}
+
+/* Identify filter for little-endian UTF-32.
+ *
+ * Called once per input byte `c` (least significant byte of each 4-byte unit
+ * first). Sets filter->flag to 1 when a unit cannot be a valid codepoint:
+ * either it is larger than 0x10FFFF or it lies in the surrogate range
+ * 0xD800..0xDFFF. The flag is never cleared here.
+ *
+ * filter->status is the position inside the current unit:
+ *   0..3  expecting the 1st..4th byte
+ *   4     expecting the 3rd byte after a 2nd byte in 0xD8..0xDF
+ *
+ * Always returns `c`. */
+static int mbfl_filt_ident_utf32le(int c, mbfl_identify_filter *filter)
+{
+	switch (filter->status) {
+	case 0: /* 1st byte */
+		filter->status = 1;
+		break;
+
+	case 1: /* 2nd byte */
+		if (c >= 0xD8 && c <= 0xDF) {
+			filter->status = 4; /* might be surrogate if we are in BMP */
+		} else {
+			filter->status = 2;
+		}
+		break;
+
+	case 2: /* 3rd byte */
+		if (c > 0x10) {
+			filter->flag = 1; /* too big */
+		}
+		filter->status = 3;
+		break;
+
+	case 3: /* 4th byte */
+		if (c) {
+			filter->flag = 1; /* too big */
+		}
+		filter->status = 0;
+		break;
+
+	case 4: /* 3rd byte, previous byte looked like surrogate */
+		if (!c) {
+			/* Bits 16..23 are zero, so 0xD8..0xDF really was a surrogate
+			 * (a non-zero 4th byte is rejected in state 3 regardless) */
+			filter->flag = 1;
+		} else if (c > 0x10) {
+			/* Same range check as state 2; without it a unit such as
+			 * 00 D8 11 00 (0x11D800) would slip through as valid */
+			filter->flag = 1; /* too big */
+		}
+		filter->status = 3;
+		break;
+	}
+	return c;
+}
+
+/* Identify filter for big-endian UTF-32.
+ *
+ * Called once per input byte `c` (most significant byte of each 4-byte unit
+ * first). Raises filter->flag when the unit exceeds 0x10FFFF or falls in the
+ * surrogate range; the flag is never cleared here.
+ *
+ * filter->status is the position inside the current unit: 0..3 mean the
+ * 1st..4th byte is expected, 4 means the 3rd byte of a unit outside the BMP
+ * is expected (no surrogate check needed there).
+ *
+ * Always returns `c`. */
+static int mbfl_filt_ident_utf32be(int c, mbfl_identify_filter *filter)
+{
+	const int position = filter->status;
+
+	if (position == 0) {
+		/* 1st byte: bits 24..31 must all be zero */
+		if (c != 0) {
+			filter->flag = 1;
+		}
+		filter->status = 1;
+	} else if (position == 1) {
+		/* 2nd byte: bits 16..23, at most 0x10 for a valid codepoint */
+		if (c > 0x10) {
+			filter->flag = 1;
+		}
+		/* Independent of the range check above: any non-zero value means
+		 * the codepoint is outside the BMP */
+		filter->status = (c != 0) ? 4 : 2;
+	} else if (position == 2) {
+		/* 3rd byte inside the BMP: 0xD8..0xDF is reserved for surrogates */
+		if (c >= 0xD8 && c <= 0xDF) {
+			filter->flag = 1;
+		}
+		filter->status = 3;
+	} else if (position == 3) {
+		/* 4th byte: any value is fine, start the next unit */
+		filter->status = 0;
+	} else if (position == 4) {
+		/* 3rd byte outside the BMP: nothing to check */
+		filter->status = 3;
+	}
+
+	return c;
+}
@@ -33,6 +33,9 @@
 extern const mbfl_encoding mbfl_encoding_utf32;
 extern const mbfl_encoding mbfl_encoding_utf32be;
 extern const mbfl_encoding mbfl_encoding_utf32le;
+/* Identify vtables for UTF-32, UTF-32BE and UTF-32LE */
+extern const struct mbfl_identify_vtbl vtbl_identify_utf32;
+extern const struct mbfl_identify_vtbl vtbl_identify_utf32be;
+extern const struct mbfl_identify_vtbl vtbl_identify_utf32le;
 extern const struct mbfl_convert_vtbl vtbl_utf32_wchar;
 extern const struct mbfl_convert_vtbl vtbl_wchar_utf32;
 extern const struct mbfl_convert_vtbl vtbl_utf32be_wchar;
@@ -169,6 +169,9 @@ static const struct mbfl_identify_vtbl *mbfl_identify_filter_list[] = {
 	&vtbl_identify_ucs2,
 	&vtbl_identify_ucs2be,
 	&vtbl_identify_ucs2le,
+	&vtbl_identify_utf32,
+	&vtbl_identify_utf32be,
+	&vtbl_identify_utf32le,
 	&vtbl_identify_false,
 	NULL
 };