mirror of
https://github.com/php/php-src.git
synced 2026-04-25 00:48:25 +02:00
Add identify filter for UTF-32{,BE,LE}
This commit is contained in:
@@ -30,6 +30,10 @@
|
||||
#include "mbfilter.h"
|
||||
#include "mbfilter_utf32.h"
|
||||
|
||||
/* Per-byte identify filters for the UTF-32 family. They are defined later in
 * this file and referenced by the vtbl_identify_utf32* tables. */
static int mbfl_filt_ident_utf32(int c, mbfl_identify_filter *filter);
static int mbfl_filt_ident_utf32be(int c, mbfl_identify_filter *filter);
static int mbfl_filt_ident_utf32le(int c, mbfl_identify_filter *filter);
|
||||
|
||||
/* Alias list for UTF-32; the array is terminated by a NULL entry. */
static const char *mbfl_encoding_utf32_aliases[] = {
	"utf32",
	NULL
};
|
||||
|
||||
const mbfl_encoding mbfl_encoding_utf32 = {
|
||||
@@ -65,6 +69,24 @@ const mbfl_encoding mbfl_encoding_utf32le = {
|
||||
&vtbl_wchar_utf32le
|
||||
};
|
||||
|
||||
const struct mbfl_identify_vtbl vtbl_identify_utf32 = {
|
||||
mbfl_no_encoding_utf32,
|
||||
mbfl_filt_ident_common_ctor,
|
||||
mbfl_filt_ident_utf32
|
||||
};
|
||||
|
||||
const struct mbfl_identify_vtbl vtbl_identify_utf32be = {
|
||||
mbfl_no_encoding_utf32be,
|
||||
mbfl_filt_ident_common_ctor,
|
||||
mbfl_filt_ident_utf32be
|
||||
};
|
||||
|
||||
const struct mbfl_identify_vtbl vtbl_identify_utf32le = {
|
||||
mbfl_no_encoding_utf32le,
|
||||
mbfl_filt_ident_common_ctor,
|
||||
mbfl_filt_ident_utf32le
|
||||
};
|
||||
|
||||
const struct mbfl_convert_vtbl vtbl_utf32_wchar = {
|
||||
mbfl_no_encoding_utf32,
|
||||
mbfl_no_encoding_wchar,
|
||||
@@ -289,3 +311,128 @@ int mbfl_filt_conv_wchar_utf32le(int c, mbfl_convert_filter *filter)
|
||||
|
||||
return c;
|
||||
}
|
||||
|
||||
/* Identify filter for UTF-32 with no byte order given; called once per byte.
 *
 * The largest valid codepoint is 0x10FFFF; we don't want values above that.
 * Neither do we want to see surrogates. Those checks are performed by the
 * UTF-32BE and UTF-32LE filters; this function only looks for a byte-order
 * mark at the very start of the input and then hands over to one of them:
 *
 *   - FF FE 00 00 (little-endian BOM): all following bytes go to the LE filter
 *   - anything else: every byte seen so far is fed to the BE filter, which
 *     then also receives all following bytes (a big-endian BOM, 00 00 FE FF,
 *     is simply an acceptable codepoint for that filter)
 *
 * While a BOM is being matched, filter->status holds the number of BOM bytes
 * matched so far (0..3). Returns c. */
static int mbfl_filt_ident_utf32(int c, mbfl_identify_filter *filter)
{
	static const unsigned char le_bom[4] = {0xff, 0xfe, 0x00, 0x00};
	int matched = filter->status;
	int i;

	if (matched < 0 || matched > 3) {
		/* Defensive only: this function never stores a status outside 0..3 */
		matched = 0;
	}

	if (c == le_bom[matched]) {
		if (matched < 3) {
			filter->status = matched + 1;
		} else {
			/* We found a little-endian byte-order mark! */
			filter->status = 0;
			filter->filter_function = mbfl_filt_ident_utf32le;
		}
		return c;
	}

	/* Not a little-endian BOM, so treat the input as big-endian.
	 * The BE filter uses filter->status as its position inside a 4-byte unit,
	 * so rewind it to 0 before replaying the BOM bytes that were swallowed;
	 * otherwise the BE filter would start in the middle of a unit. */
	filter->filter_function = mbfl_filt_ident_utf32be;
	filter->status = 0;
	for (i = 0; i < matched; i++) {
		(filter->filter_function)(le_bom[i], filter);
	}

	return (filter->filter_function)(c, filter);
}
|
||||
|
||||
/* Identify filter for UTF-32LE; called once per input byte.
 *
 * filter->status is the position inside the current 4-byte unit:
 *   0..3  the next byte is the 1st..4th byte (least significant byte first)
 *   4     the next byte is the 3rd byte and the 2nd byte was in 0xD8..0xDF
 *
 * filter->flag is set to 1 as soon as a unit cannot be a valid codepoint,
 * i.e. it is above 0x10FFFF or falls in the surrogate range 0xD800..0xDFFF.
 * Returns c unchanged. */
static int mbfl_filt_ident_utf32le(int c, mbfl_identify_filter *filter)
{
	switch (filter->status) {
	case 0: /* 1st byte: low 8 bits, any value is fine */
		filter->status = 1;
		break;

	case 1: /* 2nd byte */
		if (c >= 0xD8 && c <= 0xDF) {
			filter->status = 4; /* might be surrogate if we are in BMP */
		} else {
			filter->status = 2;
		}
		break;

	case 2: /* 3rd byte */
		if (c > 0x10) {
			filter->flag = 1; /* too big */
		}
		filter->status = 3;
		break;

	case 3: /* 4th byte: must be zero for anything up to 0x10FFFF */
		if (c) {
			filter->flag = 1; /* too big */
		}
		filter->status = 0;
		break;

	case 4: /* 3rd byte, previous byte looked like surrogate */
		if (c == 0 || c > 0x10) {
			/* Zero means we are in the BMP, so it really is a surrogate.
			 * The upper bound has to be checked here as well, because this
			 * state is entered instead of state 2 and would otherwise let
			 * values such as 0x11D800 through. */
			filter->flag = 1;
		}
		filter->status = 3;
		break;
	}

	return c;
}
|
||||
|
||||
/* Identify filter for UTF-32BE; called once per input byte.
 *
 * filter->status is the position inside the current 4-byte unit:
 *   0..3  the next byte is the 1st..4th byte (most significant byte first)
 *   4     the next byte is the 3rd byte and the 2nd byte was non-zero
 *
 * filter->flag is set to 1 when a unit is above 0x10FFFF or is a surrogate.
 * Returns c unchanged. */
static int mbfl_filt_ident_utf32be(int c, mbfl_identify_filter *filter)
{
	if (filter->status == 0) {
		/* 1st byte: anything non-zero is beyond 0x10FFFF */
		if (c != 0) {
			filter->flag = 1;
		}
		filter->status = 1;
	} else if (filter->status == 1) {
		/* 2nd byte: selects the plane; planes above 0x10 do not exist */
		if (c > 0x10) {
			filter->flag = 1;
		}
		/* A non-zero plane is outside the BMP, so the surrogate check on the
		 * 3rd byte does not apply and state 4 is used instead of state 2 */
		filter->status = (c != 0) ? 4 : 2;
	} else if (filter->status == 2) {
		/* 3rd byte inside the BMP: 0xD8..0xDF is reserved for surrogates */
		if (!(c < 0xD8 || c > 0xDF)) {
			filter->flag = 1;
		}
		filter->status = 3;
	} else if (filter->status == 3) {
		/* 4th byte: any value; start over with the next unit */
		filter->status = 0;
	} else if (filter->status == 4) {
		/* 3rd byte outside the BMP: any value */
		filter->status = 3;
	}

	return c;
}
|
||||
|
||||
@@ -33,6 +33,9 @@
|
||||
/* Encoding descriptors */
extern const mbfl_encoding mbfl_encoding_utf32;
extern const mbfl_encoding mbfl_encoding_utf32be;
extern const mbfl_encoding mbfl_encoding_utf32le;

/* Identify-filter tables */
extern const struct mbfl_identify_vtbl vtbl_identify_utf32;
extern const struct mbfl_identify_vtbl vtbl_identify_utf32be;
extern const struct mbfl_identify_vtbl vtbl_identify_utf32le;

/* Conversion-filter tables */
extern const struct mbfl_convert_vtbl vtbl_utf32_wchar;
extern const struct mbfl_convert_vtbl vtbl_wchar_utf32;
extern const struct mbfl_convert_vtbl vtbl_utf32be_wchar;
|
||||
|
||||
@@ -169,6 +169,9 @@ static const struct mbfl_identify_vtbl *mbfl_identify_filter_list[] = {
|
||||
&vtbl_identify_ucs2,
|
||||
&vtbl_identify_ucs2be,
|
||||
&vtbl_identify_ucs2le,
|
||||
&vtbl_identify_utf32,
|
||||
&vtbl_identify_utf32be,
|
||||
&vtbl_identify_utf32le,
|
||||
&vtbl_identify_false,
|
||||
NULL
|
||||
};
|
||||
|
||||
Reference in New Issue
Block a user