1
0
mirror of https://github.com/php/php-src.git synced 2026-04-25 00:48:25 +02:00

Add identify filter for UTF-32{,BE,LE}

This commit is contained in:
Alex Dowad
2020-09-06 14:42:55 +02:00
parent d8895cd054
commit 7047e5d2c4
3 changed files with 153 additions and 0 deletions
@@ -30,6 +30,10 @@
#include "mbfilter.h"
#include "mbfilter_utf32.h"
/* Forward declarations of the identify callbacks defined further down in this
 * file; they are needed here because the identify vtables reference them
 * before their definitions appear. */
static int mbfl_filt_ident_utf32(int c, mbfl_identify_filter *filter);
static int mbfl_filt_ident_utf32le(int c, mbfl_identify_filter *filter);
static int mbfl_filt_ident_utf32be(int c, mbfl_identify_filter *filter);
/* NULL-terminated list of alias names for the UTF-32 encoding
 * (presumably matched against user-supplied encoding names; confirm in the
 * encoding lookup code). */
static const char *mbfl_encoding_utf32_aliases[] = {"utf32", NULL};
const mbfl_encoding mbfl_encoding_utf32 = {
@@ -65,6 +69,24 @@ const mbfl_encoding mbfl_encoding_utf32le = {
&vtbl_wchar_utf32le
};
/* Identify vtable for UTF-32 without an explicit byte order.
 * Positional fields as used here: encoding id, constructor, filter callback
 * (field names are declared elsewhere -- confirm against mbfl_identify_vtbl). */
const struct mbfl_identify_vtbl vtbl_identify_utf32 = {
mbfl_no_encoding_utf32,
mbfl_filt_ident_common_ctor,
/* looks for a little-endian BOM, then hands off to the LE or BE callback */
mbfl_filt_ident_utf32
};
/* Identify vtable for big-endian UTF-32.
 * Positional fields as used here: encoding id, constructor, filter callback
 * (field names are declared elsewhere -- confirm against mbfl_identify_vtbl). */
const struct mbfl_identify_vtbl vtbl_identify_utf32be = {
mbfl_no_encoding_utf32be,
mbfl_filt_ident_common_ctor,
mbfl_filt_ident_utf32be
};
/* Identify vtable for little-endian UTF-32.
 * Positional fields as used here: encoding id, constructor, filter callback
 * (field names are declared elsewhere -- confirm against mbfl_identify_vtbl). */
const struct mbfl_identify_vtbl vtbl_identify_utf32le = {
mbfl_no_encoding_utf32le,
mbfl_filt_ident_common_ctor,
mbfl_filt_ident_utf32le
};
const struct mbfl_convert_vtbl vtbl_utf32_wchar = {
mbfl_no_encoding_utf32,
mbfl_no_encoding_wchar,
@@ -289,3 +311,128 @@ int mbfl_filt_conv_wchar_utf32le(int c, mbfl_convert_filter *filter)
return c;
}
/*
 * Identify callback for UTF-32 with no explicit byte order.
 *
 * Called once per input byte `c`. While filter->status is 0..3 it counts how
 * many leading bytes of the little-endian byte-order mark (FF FE 00 00) have
 * matched so far:
 *
 *  - If all four bytes match, the filter switches itself over to
 *    mbfl_filt_ident_utf32le (status reset to 0) for the rest of the input.
 *  - On the first mismatch, the filter switches itself over to
 *    mbfl_filt_ident_utf32be, replays the BOM bytes that were swallowed so
 *    far, and then forwards the current byte.
 *
 * The actual validity checks (codepoints above 0x10FFFF, surrogates) are done
 * by the LE/BE callbacks that this function hands off to.
 *
 * Returns `c` while a BOM prefix is still being matched, otherwise whatever
 * the callback that receives `c` returns.
 */
static int mbfl_filt_ident_utf32(int c, mbfl_identify_filter *filter)
{
	/* Little-endian byte-order mark, in stream order */
	static const unsigned char le_bom[4] = {0xff, 0xfe, 0x00, 0x00};
	int matched = filter->status; /* number of BOM bytes matched so far */
	int i;

	if (matched >= 0 && matched < 4) {
		if (c == le_bom[matched]) {
			if (matched == 3) {
				/* We found a little-endian byte-order mark! */
				filter->status = 0;
				filter->filter_function = mbfl_filt_ident_utf32le;
			} else {
				filter->status = matched + 1;
			}
			return c;
		}

		/* Not a little-endian BOM after all: treat the stream as big-endian.
		 * The BE callback keeps its own position-in-codepoint counter in
		 * filter->status, so it must be reset to 0 before the swallowed
		 * bytes are replayed; otherwise they would be interpreted at the
		 * wrong byte position. */
		filter->filter_function = mbfl_filt_ident_utf32be;
		filter->status = 0;
		for (i = 0; i < matched; i++) {
			(filter->filter_function)(le_bom[i], filter);
		}
	}

	return (filter->filter_function)(c, filter);
}
/*
 * Identify callback for little-endian UTF-32.
 *
 * Called once per input byte `c`; filter->status tracks the position inside
 * the current 4-byte unit (least significant byte first):
 *
 *   0: expecting 1st byte (bits 0-7)
 *   1: expecting 2nd byte (bits 8-15)
 *   2: expecting 3rd byte (bits 16-23)
 *   3: expecting 4th byte (bits 24-31)
 *   4: expecting 3rd byte, and the 2nd byte was in 0xD8..0xDF
 *
 * filter->flag is set to 1 as soon as the unit cannot be a valid codepoint:
 * a value above 0x10FFFF, or a surrogate (0xD800..0xDFFF).
 *
 * Always returns `c`.
 */
static int mbfl_filt_ident_utf32le(int c, mbfl_identify_filter *filter)
{
	switch (filter->status) {
	case 0: /* 1st byte: any value is acceptable */
		filter->status = 1;
		break;

	case 1: /* 2nd byte */
		if (c >= 0xD8 && c <= 0xDF) {
			filter->status = 4; /* might be surrogate if we are in BMP */
		} else {
			filter->status = 2;
		}
		break;

	case 2: /* 3rd byte */
		if (c > 0x10) {
			filter->flag = 1; /* too big */
		}
		filter->status = 3;
		break;

	case 3: /* 4th byte */
		if (c) {
			filter->flag = 1; /* too big */
		}
		filter->status = 0;
		break;

	case 4: /* 3rd byte, previous byte looked like surrogate */
		if (!c) {
			filter->flag = 1; /* yep, it's a surrogate */
		} else if (c > 0x10) {
			/* Outside the BMP the usual upper bound still applies;
			 * e.g. 00 D8 11 00 would be 0x11D800 */
			filter->flag = 1; /* too big */
		}
		filter->status = 3;
		break;
	}

	return c;
}
/*
 * Identify callback for big-endian UTF-32.
 *
 * Called once per input byte `c`; filter->status tracks the position inside
 * the current 4-byte unit (most significant byte first):
 *
 *   0: expecting 1st byte (bits 24-31)
 *   1: expecting 2nd byte (bits 16-23)
 *   2: expecting 3rd byte (bits 8-15), 2nd byte was zero (BMP)
 *   3: expecting 4th byte (bits 0-7)
 *   4: expecting 3rd byte, 2nd byte was non-zero (outside the BMP)
 *
 * filter->flag is set to 1 when the unit cannot be a valid codepoint
 * (above 0x10FFFF, or in the surrogate range). Always returns `c`.
 */
static int mbfl_filt_ident_utf32be(int c, mbfl_identify_filter *filter)
{
	int invalid = 0;

	switch (filter->status) {
	case 0: /* 1st byte: must be zero, otherwise the value is too big */
		invalid = (c != 0);
		filter->status = 1;
		break;

	case 1: /* 2nd byte: plane number, at most 0x10 */
		invalid = (c > 0x10);
		/* a non-zero plane means we are not in the BMP, so the surrogate
		 * check on the next byte does not apply */
		filter->status = c ? 4 : 2;
		break;

	case 2: /* 3rd byte, in the BMP: reject the surrogate range */
		invalid = (c >= 0xD8 && c <= 0xDF);
		filter->status = 3;
		break;

	case 3: /* 4th byte: any value is fine, start the next unit */
		filter->status = 0;
		break;

	case 4: /* 3rd byte, not in BMP: any value is fine */
		filter->status = 3;
		break;
	}

	if (invalid) {
		filter->flag = 1;
	}
	return c;
}
@@ -33,6 +33,9 @@
/* Encoding descriptors for UTF-32 and its explicit byte-order variants */
extern const mbfl_encoding mbfl_encoding_utf32;
extern const mbfl_encoding mbfl_encoding_utf32be;
extern const mbfl_encoding mbfl_encoding_utf32le;
/* Identify vtables, one per UTF-32 variant */
extern const struct mbfl_identify_vtbl vtbl_identify_utf32;
extern const struct mbfl_identify_vtbl vtbl_identify_utf32be;
extern const struct mbfl_identify_vtbl vtbl_identify_utf32le;
/* Conversion vtables between UTF-32 variants and wchar */
extern const struct mbfl_convert_vtbl vtbl_utf32_wchar;
extern const struct mbfl_convert_vtbl vtbl_wchar_utf32;
extern const struct mbfl_convert_vtbl vtbl_utf32be_wchar;
+3
View File
@@ -169,6 +169,9 @@ static const struct mbfl_identify_vtbl *mbfl_identify_filter_list[] = {
&vtbl_identify_ucs2,
&vtbl_identify_ucs2be,
&vtbl_identify_ucs2le,
&vtbl_identify_utf32,
&vtbl_identify_utf32be,
&vtbl_identify_utf32le,
&vtbl_identify_false,
NULL
};