1
0
mirror of https://github.com/php/php-src.git synced 2026-03-24 08:12:21 +01:00

Remove unused conversion code from mbstring

Over the last few years, I refactored mbstring to perform encoding conversion
a buffer at a time, rather than a single byte at a time. This resulted in a
huge performance increase.

After the refactoring, the old "byte-at-a-time" code was retained for two
reasons:

1) It was used by the mailparse PECL extension.
2) It was used to implement mb_strcut for some text encodings.

However, after reviewing mailparse's use of mbstring, it is clear that
mailparse relies on mbstring only for decoding Quoted-Printable (QPrint),
and possibly Base64. It does not use the byte-at-a-time conversion code
for any other encoding.

Further, mb_strcut only relies on the byte-at-a-time conversion code
for a limited number of legacy text encodings, such as ISO-2022-JP,
HZ, UTF-7, etc.

Hence, we can remove over 5000 lines of unused code without breaking
anything. This will help to reduce binary size, and make the mbstring
codebase easier to navigate for new contributors.
This commit is contained in:
Alex Dowad
2026-01-11 08:23:59 +09:00
parent 11bec6b92f
commit c34b84ed81
14 changed files with 76 additions and 4919 deletions

File diff suppressed because it is too large Load Diff

View File

@@ -42,8 +42,4 @@ int mbfilter_sjis_emoji_docomo2unicode(int s, int *snd);
int mbfilter_sjis_emoji_kddi2unicode(int s, int *snd);
int mbfilter_sjis_emoji_sb2unicode(int s, int *snd);
int mbfilter_unicode2sjis_emoji_docomo(int c, int *s1, mbfl_convert_filter *filter);
int mbfilter_unicode2sjis_emoji_kddi_sjis(int c, int *s1, mbfl_convert_filter *filter);
int mbfilter_unicode2sjis_emoji_sb(int c, int *s1, mbfl_convert_filter *filter);
#endif /* MBFL_MBFILTER_CJK_H */

View File

@@ -33,10 +33,5 @@
#include "mbfilter.h"
extern const mbfl_encoding mbfl_encoding_cp51932;
extern const struct mbfl_convert_vtbl vtbl_cp51932_wchar;
extern const struct mbfl_convert_vtbl vtbl_wchar_cp51932;
int mbfl_filt_conv_cp51932_wchar(int c, mbfl_convert_filter *filter);
int mbfl_filt_conv_wchar_cp51932(int c, mbfl_convert_filter *filter);
#endif /* MBFL_MBFILTER_CP51932_H */

View File

@@ -21,61 +21,10 @@ static inline uint32_t coalesce(uint32_t a, uint32_t b)
return a ? a : b;
}
/*
 * Byte-at-a-time decoder shared by table-driven single-byte encodings.
 *
 * Values in [0, tbl_min) are passed through unchanged. Negative values are
 * reported as MBFL_BAD_INPUT. Anything else is looked up in `tbl` at index
 * (c - tbl_min); a zero table entry is reported as MBFL_BAD_INPUT.
 *
 * Returns 0 after forwarding one value to filter->output_function (CK() bails
 * out of the function early if that callback reports an error).
 */
static int mbfl_conv_singlebyte_table(int c, mbfl_convert_filter *filter, int tbl_min, const unsigned short tbl[])
{
	if (c < 0) {
		CK((*filter->output_function)(MBFL_BAD_INPUT, filter->data));
		return 0;
	}

	if (c < tbl_min) {
		/* Below the table: identity mapping */
		CK((*filter->output_function)(c, filter->data));
		return 0;
	}

	CK((*filter->output_function)(coalesce(tbl[c - tbl_min], MBFL_BAD_INPUT), filter->data));
	return 0;
}
/*
 * Byte-at-a-time encoder shared by table-driven single-byte encodings.
 *
 * Values in [0, tbl_min) are emitted unchanged. Otherwise `tbl` is scanned
 * linearly over its (256 - tbl_min) entries; on a match the corresponding
 * byte value (index + tbl_min) is emitted. Negative values, MBFL_BAD_INPUT
 * and values with no table entry go to mbfl_filt_conv_illegal_output().
 */
static int mbfl_conv_reverselookup_table(int c, mbfl_convert_filter *filter, int tbl_min, const unsigned short tbl[])
{
	if (c >= 0 && c < tbl_min) {
		CK((*filter->output_function)(c, filter->data));
		return 0;
	}

	if (c >= 0 && c != MBFL_BAD_INPUT) {
		const int n_entries = 256 - tbl_min;
		int idx = 0;

		while (idx < n_entries) {
			if (tbl[idx] == c) {
				CK((*filter->output_function)(idx + tbl_min, filter->data));
				return 0;
			}
			idx++;
		}
	}

	/* Either an error marker or a value this encoding cannot represent */
	CK(mbfl_filt_conv_illegal_output(c, filter));
	return 0;
}
/* Initialize data structures for a single-byte encoding */
#define DEF_SB(id, name, mime_name, aliases) \
static int mbfl_filt_conv_##id##_wchar(int c, mbfl_convert_filter *filter); \
static int mbfl_filt_conv_wchar_##id(int c, mbfl_convert_filter *filter); \
static size_t mb_##id##_to_wchar(unsigned char **in, size_t *in_len, uint32_t *buf, size_t bufsize, unsigned int *state); \
static void mb_wchar_to_##id(uint32_t *in, size_t len, mb_convert_buf *buf, bool end); \
static const struct mbfl_convert_vtbl vtbl_##id##_wchar = { \
mbfl_no_encoding_##id, \
mbfl_no_encoding_wchar, \
mbfl_filt_conv_common_ctor, \
NULL, \
mbfl_filt_conv_##id##_wchar, \
mbfl_filt_conv_common_flush, \
NULL \
}; \
static const struct mbfl_convert_vtbl vtbl_wchar_##id = { \
mbfl_no_encoding_wchar, \
mbfl_no_encoding_##id, \
mbfl_filt_conv_common_ctor, \
NULL, \
mbfl_filt_conv_wchar_##id, \
mbfl_filt_conv_common_flush, \
NULL \
}; \
const mbfl_encoding mbfl_encoding_##id = { \
mbfl_no_encoding_##id, \
name, \
@@ -83,8 +32,8 @@ static int mbfl_conv_reverselookup_table(int c, mbfl_convert_filter *filter, int
aliases, \
NULL, \
MBFL_ENCTYPE_SBCS, \
&vtbl_##id##_wchar, \
&vtbl_wchar_##id, \
NULL, \
NULL, \
mb_##id##_to_wchar, \
mb_wchar_to_##id, \
NULL, \
@@ -93,12 +42,6 @@ static int mbfl_conv_reverselookup_table(int c, mbfl_convert_filter *filter, int
/* For single-byte encodings which use a conversion table */
#define DEF_SB_TBL(id, name, mime_name, aliases, tbl_min, tbl) \
static int mbfl_filt_conv_##id##_wchar(int c, mbfl_convert_filter *filter) { \
return mbfl_conv_singlebyte_table(c, filter, tbl_min, tbl); \
} \
static int mbfl_filt_conv_wchar_##id(int c, mbfl_convert_filter *filter) { \
return mbfl_conv_reverselookup_table(c, filter, tbl_min, tbl); \
} \
static size_t mb_##id##_to_wchar(unsigned char **in, size_t *in_len, uint32_t *buf, size_t bufsize, unsigned int *state) \
{ \
unsigned char *p = *in, *e = p + *in_len; \
@@ -140,22 +83,6 @@ static int mbfl_conv_reverselookup_table(int c, mbfl_convert_filter *filter, int
static const char *ascii_aliases[] = {"ANSI_X3.4-1968", "iso-ir-6", "ANSI_X3.4-1986", "ISO_646.irv:1991", "US-ASCII", "ISO646-US", "us", "IBM367", "IBM-367", "cp367", "csASCII", NULL};
DEF_SB(ascii, "ASCII", "US-ASCII", ascii_aliases);
/* ASCII => wchar: values below 0x80 pass through; everything else becomes MBFL_BAD_INPUT. */
static int mbfl_filt_conv_ascii_wchar(int c, mbfl_convert_filter *filter)
{
	if (c < 0x80) {
		CK((*filter->output_function)(c, filter->data));
	} else {
		CK((*filter->output_function)(MBFL_BAD_INPUT, filter->data));
	}
	return 0;
}
/*
 * wchar => ASCII: only values in [0, 0x80) are representable; anything else
 * (including MBFL_BAD_INPUT) is handed to mbfl_filt_conv_illegal_output().
 */
static int mbfl_filt_conv_wchar_ascii(int c, mbfl_convert_filter *filter)
{
	if (c < 0 || c >= 0x80 || c == MBFL_BAD_INPUT) {
		CK(mbfl_filt_conv_illegal_output(c, filter));
		return 0;
	}

	CK((*filter->output_function)(c, filter->data));
	return 0;
}
static size_t mb_ascii_to_wchar(unsigned char **in, size_t *in_len, uint32_t *buf, size_t bufsize, unsigned int *state)
{
unsigned char *p = *in, *e = p + *in_len;
@@ -195,21 +122,6 @@ static void mb_wchar_to_ascii(uint32_t *in, size_t len, mb_convert_buf *buf, boo
static const char *iso8859_1_aliases[] = {"ISO8859-1", "latin1", NULL};
DEF_SB(8859_1, "ISO-8859-1", "ISO-8859-1", iso8859_1_aliases);
/*
 * ISO-8859-1 => wchar: the input value is forwarded unchanged, and the output
 * callback's return value is handed straight back to the caller.
 */
static int mbfl_filt_conv_8859_1_wchar(int c, mbfl_convert_filter *filter)
{
	const int rc = (*filter->output_function)(c, filter->data);
	return rc;
}
/*
 * wchar => ISO-8859-1: values in [0, 0x100) are emitted as-is; anything else
 * (including MBFL_BAD_INPUT) is handed to mbfl_filt_conv_illegal_output().
 */
static int mbfl_filt_conv_wchar_8859_1(int c, mbfl_convert_filter *filter)
{
	if (c < 0 || c >= 0x100 || c == MBFL_BAD_INPUT) {
		CK(mbfl_filt_conv_illegal_output(c, filter));
		return 0;
	}

	CK((*filter->output_function)(c, filter->data));
	return 0;
}
static size_t mb_8859_1_to_wchar(unsigned char **in, size_t *in_len, uint32_t *buf, size_t bufsize, unsigned int *state)
{
unsigned char *p = *in, *e = p + *in_len;
@@ -494,38 +406,6 @@ static const unsigned short cp1252_ucs_table[] = {
};
DEF_SB(cp1252, "Windows-1252", "Windows-1252", cp1252_aliases);
/*
 * wchar => Windows-1252.
 *
 * - Values >= 0x100 are searched for in the 32-entry cp1252_ucs_table; a hit
 *   at index n is emitted as byte 0x80 + n.
 * - Values <= 0x7F or in [0xA0, 0xFF] are emitted unchanged, as are the five
 *   values 0x81, 0x8D, 0x8F, 0x90 and 0x9D.
 * - Everything else (negative, MBFL_BAD_INPUT, other values in 0x80..0x9F,
 *   or no table match) goes to mbfl_filt_conv_illegal_output().
 */
static int mbfl_filt_conv_wchar_cp1252(int c, mbfl_convert_filter *filter)
{
	if (c >= 0 && c != MBFL_BAD_INPUT) {
		if (c >= 0x100) {
			for (int idx = 0; idx < 32; idx++) {
				if (cp1252_ucs_table[idx] == c) {
					CK((*filter->output_function)(0x80 + idx, filter->data));
					return 0;
				}
			}
		} else {
			switch (c) {
			case 0x81:
			case 0x8D:
			case 0x8F:
			case 0x90:
			case 0x9D:
				CK((*filter->output_function)(c, filter->data));
				return 0;
			default:
				if (c <= 0x7F || c >= 0xA0) {
					CK((*filter->output_function)(c, filter->data));
					return 0;
				}
				break;
			}
		}
	}

	CK(mbfl_filt_conv_illegal_output(c, filter));
	return 0;
}
/*
 * Windows-1252 => wchar: values in [0x80, 0xA0) are mapped through
 * cp1252_ucs_table (a zero entry yields MBFL_BAD_INPUT); every other value is
 * forwarded unchanged.
 */
static int mbfl_filt_conv_cp1252_wchar(int c, mbfl_convert_filter *filter)
{
	const int in_table_range = (c >= 0x80 && c < 0xA0);
	int w = c;

	if (in_table_range) {
		w = coalesce(cp1252_ucs_table[c - 0x80], MBFL_BAD_INPUT);
	}

	CK((*filter->output_function)(w, filter->data));
	return 0;
}
static size_t mb_cp1252_to_wchar(unsigned char **in, size_t *in_len, uint32_t *buf, size_t bufsize, unsigned int *state)
{
unsigned char *p = *in, *e = p + *in_len;
@@ -701,32 +581,6 @@ static const unsigned char ucs_armscii8_table[] = {
};
DEF_SB(armscii8, "ArmSCII-8", "ArmSCII-8", armscii8_aliases);
/*
 * ArmSCII-8 => wchar: values below 0xA0 pass through; the rest are mapped
 * through armscii8_ucs_table (a zero entry yields MBFL_BAD_INPUT).
 */
static int mbfl_filt_conv_armscii8_wchar(int c, mbfl_convert_filter *filter)
{
	if (c < 0xA0) {
		CK((*filter->output_function)(c, filter->data));
	} else {
		CK((*filter->output_function)(coalesce(armscii8_ucs_table[c - 0xA0], MBFL_BAD_INPUT), filter->data));
	}
	return 0;
}
/*
 * wchar => ArmSCII-8.
 *
 * - 0x28..0x2F are remapped through ucs_armscii8_table.
 * - Negative values and MBFL_BAD_INPUT are illegal.
 * - Other values below 0xA0 are emitted unchanged.
 * - The rest are searched for in the 0x60-entry armscii8_ucs_table; a hit at
 *   index n is emitted as byte 0xA0 + n, a miss is illegal.
 */
static int mbfl_filt_conv_wchar_armscii8(int c, mbfl_convert_filter *filter)
{
	if (c >= 0x28 && c <= 0x2F) {
		CK((*filter->output_function)(ucs_armscii8_table[c - 0x28], filter->data));
		return 0;
	}

	if (c >= 0 && c != MBFL_BAD_INPUT) {
		if (c < 0xA0) {
			CK((*filter->output_function)(c, filter->data));
			return 0;
		}

		for (int idx = 0; idx < 0x60; idx++) {
			if (armscii8_ucs_table[idx] == c) {
				CK((*filter->output_function)(0xA0 + idx, filter->data));
				return 0;
			}
		}
	}

	CK(mbfl_filt_conv_illegal_output(c, filter));
	return 0;
}
static size_t mb_armscii8_to_wchar(unsigned char **in, size_t *in_len, uint32_t *buf, size_t bufsize, unsigned int *state)
{
unsigned char *p = *in, *e = p + *in_len;

View File

@@ -30,7 +30,6 @@
#include "mbfilter.h"
#include "mbfilter_ucs2.h"
static int mbfl_filt_conv_ucs2_wchar_flush(mbfl_convert_filter *filter);
static size_t mb_ucs2_to_wchar(unsigned char **in, size_t *in_len, uint32_t *buf, size_t bufsize, unsigned int *state);
static size_t mb_ucs2be_to_wchar(unsigned char **in, size_t *in_len, uint32_t *buf, size_t bufsize, unsigned int *state);
static void mb_wchar_to_ucs2be(uint32_t *in, size_t len, mb_convert_buf *buf, bool end);
@@ -53,8 +52,8 @@ const mbfl_encoding mbfl_encoding_ucs2 = {
mbfl_encoding_ucs2_aliases,
NULL,
MBFL_ENCTYPE_WCS2,
&vtbl_ucs2_wchar,
&vtbl_wchar_ucs2,
NULL,
NULL,
mb_ucs2_to_wchar,
mb_wchar_to_ucs2be,
NULL,
@@ -68,8 +67,8 @@ const mbfl_encoding mbfl_encoding_ucs2be = {
mbfl_encoding_ucs2be_aliases,
NULL,
MBFL_ENCTYPE_WCS2,
&vtbl_ucs2be_wchar,
&vtbl_wchar_ucs2be,
NULL,
NULL,
mb_ucs2be_to_wchar,
mb_wchar_to_ucs2be,
NULL,
@@ -83,158 +82,14 @@ const mbfl_encoding mbfl_encoding_ucs2le = {
mbfl_encoding_ucs2le_aliases,
NULL,
MBFL_ENCTYPE_WCS2,
&vtbl_ucs2le_wchar,
&vtbl_wchar_ucs2le,
NULL,
NULL,
mb_ucs2le_to_wchar,
mb_wchar_to_ucs2le,
NULL,
NULL,
};
/*
 * Conversion vtables for the byte-at-a-time UCS-2 filters.
 * Each initializer lists, in order: source encoding id, destination encoding
 * id, mbfl_filt_conv_common_ctor, NULL, the per-value filter function, the
 * flush function, NULL.
 * NOTE(review): struct mbfl_convert_vtbl is declared elsewhere; its member
 * names are not visible here -- confirm slot meanings against that header.
 */

/* UCS-2 => wchar (uses the filter that inspects the first unit for a BOM) */
const struct mbfl_convert_vtbl vtbl_ucs2_wchar = {
mbfl_no_encoding_ucs2,
mbfl_no_encoding_wchar,
mbfl_filt_conv_common_ctor,
NULL,
mbfl_filt_conv_ucs2_wchar,
mbfl_filt_conv_ucs2_wchar_flush,
NULL,
};
/* wchar => UCS-2 (reuses the big-endian output filter) */
const struct mbfl_convert_vtbl vtbl_wchar_ucs2 = {
mbfl_no_encoding_wchar,
mbfl_no_encoding_ucs2,
mbfl_filt_conv_common_ctor,
NULL,
mbfl_filt_conv_wchar_ucs2be,
mbfl_filt_conv_common_flush,
NULL,
};
/* UCS-2BE => wchar */
const struct mbfl_convert_vtbl vtbl_ucs2be_wchar = {
mbfl_no_encoding_ucs2be,
mbfl_no_encoding_wchar,
mbfl_filt_conv_common_ctor,
NULL,
mbfl_filt_conv_ucs2be_wchar,
mbfl_filt_conv_ucs2_wchar_flush,
NULL,
};
/* wchar => UCS-2BE */
const struct mbfl_convert_vtbl vtbl_wchar_ucs2be = {
mbfl_no_encoding_wchar,
mbfl_no_encoding_ucs2be,
mbfl_filt_conv_common_ctor,
NULL,
mbfl_filt_conv_wchar_ucs2be,
mbfl_filt_conv_common_flush,
NULL,
};
/* UCS-2LE => wchar */
const struct mbfl_convert_vtbl vtbl_ucs2le_wchar = {
mbfl_no_encoding_ucs2le,
mbfl_no_encoding_wchar,
mbfl_filt_conv_common_ctor,
NULL,
mbfl_filt_conv_ucs2le_wchar,
mbfl_filt_conv_ucs2_wchar_flush,
NULL,
};
/* wchar => UCS-2LE */
const struct mbfl_convert_vtbl vtbl_wchar_ucs2le = {
mbfl_no_encoding_wchar,
mbfl_no_encoding_ucs2le,
mbfl_filt_conv_common_ctor,
NULL,
mbfl_filt_conv_wchar_ucs2le,
mbfl_filt_conv_common_flush,
NULL,
};
/* Evaluate `statement`; if its result is negative, return -1 from the enclosing function. */
#define CK(statement) do { if ((statement) < 0) return (-1); } while (0)
/*
 * UCS-2 (byte order not yet known) => wchar, one input byte per call.
 *
 * The first byte is cached. When the second byte arrives the pair is read as
 * a big-endian 16-bit unit:
 *  - 0xFFFE: byte-swapped BOM; later bytes are routed to the little-endian filter.
 *  - 0xFEFF: BOM; later bytes are routed to the big-endian filter, nothing is emitted.
 *  - otherwise: big-endian filter is selected and the unit is emitted.
 */
int mbfl_filt_conv_ucs2_wchar(int c, mbfl_convert_filter *filter)
{
	if (filter->status == 0) {
		/* First byte of the unit: remember it and wait for the second */
		filter->cache = c & 0xFF;
		filter->status = 1;
		return 0;
	}

	filter->status = 0;
	const int unit = (filter->cache << 8) | (c & 0xFF);

	switch (unit) {
	case 0xFFFE:
		/* Found little-endian byte order mark */
		filter->filter_function = mbfl_filt_conv_ucs2le_wchar;
		break;
	case 0xFEFF:
		filter->filter_function = mbfl_filt_conv_ucs2be_wchar;
		break;
	default:
		filter->filter_function = mbfl_filt_conv_ucs2be_wchar;
		CK((*filter->output_function)(unit, filter->data));
		break;
	}
	return 0;
}
/*
 * UCS-2BE => wchar, one input byte per call: the first byte is stored as the
 * high 8 bits, the second byte supplies the low 8 bits and triggers output.
 */
int mbfl_filt_conv_ucs2be_wchar(int c, mbfl_convert_filter *filter)
{
	const int byte = c & 0xFF;

	if (filter->status != 0) {
		filter->status = 0;
		CK((*filter->output_function)(byte | filter->cache, filter->data));
		return 0;
	}

	filter->status = 1;
	filter->cache = byte << 8;
	return 0;
}
/*
 * wchar => UCS-2BE: values in [0, MBFL_WCSPLANE_UCS2MAX) are emitted as two
 * bytes, high byte first; anything else goes to mbfl_filt_conv_illegal_output().
 */
int mbfl_filt_conv_wchar_ucs2be(int c, mbfl_convert_filter *filter)
{
	if (c < 0 || c >= MBFL_WCSPLANE_UCS2MAX) {
		CK(mbfl_filt_conv_illegal_output(c, filter));
		return 0;
	}

	CK((*filter->output_function)((c >> 8) & 0xFF, filter->data));
	CK((*filter->output_function)(c & 0xFF, filter->data));
	return 0;
}
/*
 * UCS-2LE => wchar, one input byte per call: the first byte is stored as the
 * low 8 bits, the second byte supplies the high 8 bits and triggers output.
 */
int mbfl_filt_conv_ucs2le_wchar(int c, mbfl_convert_filter *filter)
{
	const int byte = c & 0xFF;

	if (filter->status != 0) {
		filter->status = 0;
		CK((*filter->output_function)((byte << 8) | filter->cache, filter->data));
		return 0;
	}

	filter->status = 1;
	filter->cache = byte;
	return 0;
}
/*
 * wchar => UCS-2LE: values in [0, MBFL_WCSPLANE_UCS2MAX) are emitted as two
 * bytes, low byte first; anything else goes to mbfl_filt_conv_illegal_output().
 */
int mbfl_filt_conv_wchar_ucs2le(int c, mbfl_convert_filter *filter)
{
	if (c < 0 || c >= MBFL_WCSPLANE_UCS2MAX) {
		CK(mbfl_filt_conv_illegal_output(c, filter));
		return 0;
	}

	CK((*filter->output_function)(c & 0xFF, filter->data));
	CK((*filter->output_function)((c >> 8) & 0xFF, filter->data));
	return 0;
}
/*
 * Flush for the UCS-2 => wchar filters. A non-zero status means a byte is
 * still pending (the input ended mid-unit), so MBFL_BAD_INPUT is emitted and
 * the state is reset. The downstream flush callback, if set, is then invoked;
 * its return value is not inspected.
 */
static int mbfl_filt_conv_ucs2_wchar_flush(mbfl_convert_filter *filter)
{
	const int pending = filter->status;

	if (pending) {
		/* Input string was truncated */
		filter->status = 0;
		CK((*filter->output_function)(MBFL_BAD_INPUT, filter->data));
	}

	if (filter->flush_function) {
		(*filter->flush_function)(filter->data);
	}
	return 0;
}
#define DETECTED_BE 1
#define DETECTED_LE 2

View File

@@ -35,17 +35,5 @@
extern const mbfl_encoding mbfl_encoding_ucs2;
extern const mbfl_encoding mbfl_encoding_ucs2be;
extern const mbfl_encoding mbfl_encoding_ucs2le;
extern const struct mbfl_convert_vtbl vtbl_ucs2_wchar;
extern const struct mbfl_convert_vtbl vtbl_wchar_ucs2;
extern const struct mbfl_convert_vtbl vtbl_ucs2be_wchar;
extern const struct mbfl_convert_vtbl vtbl_wchar_ucs2be;
extern const struct mbfl_convert_vtbl vtbl_ucs2le_wchar;
extern const struct mbfl_convert_vtbl vtbl_wchar_ucs2le;
int mbfl_filt_conv_ucs2_wchar(int c, mbfl_convert_filter *filter);
int mbfl_filt_conv_ucs2be_wchar(int c, mbfl_convert_filter *filter);
int mbfl_filt_conv_wchar_ucs2be(int c, mbfl_convert_filter *filter);
int mbfl_filt_conv_ucs2le_wchar(int c, mbfl_convert_filter *filter);
int mbfl_filt_conv_wchar_ucs2le(int c, mbfl_convert_filter *filter);
#endif /* MBFL_MBFILTER_UCS2_H */

View File

@@ -44,8 +44,6 @@ static const char *mbfl_encoding_ucs4_aliases[] = {"ISO-10646-UCS-4", "UCS4", NU
static const char *mbfl_encoding_ucs4be_aliases[] = {"byte4be", NULL};
static const char *mbfl_encoding_ucs4le_aliases[] = {"byte4le", NULL};
static int mbfl_filt_conv_ucs4_wchar_flush(mbfl_convert_filter *filter);
const mbfl_encoding mbfl_encoding_ucs4 = {
mbfl_no_encoding_ucs4,
"UCS-4",
@@ -53,8 +51,8 @@ const mbfl_encoding mbfl_encoding_ucs4 = {
mbfl_encoding_ucs4_aliases,
NULL,
MBFL_ENCTYPE_WCS4,
&vtbl_ucs4_wchar,
&vtbl_wchar_ucs4,
NULL,
NULL,
mb_ucs4_to_wchar,
mb_wchar_to_ucs4be,
NULL,
@@ -68,8 +66,8 @@ const mbfl_encoding mbfl_encoding_ucs4be = {
mbfl_encoding_ucs4be_aliases,
NULL,
MBFL_ENCTYPE_WCS4,
&vtbl_ucs4be_wchar,
&vtbl_wchar_ucs4be,
NULL,
NULL,
mb_ucs4be_to_wchar,
mb_wchar_to_ucs4be,
NULL,
@@ -83,239 +81,14 @@ const mbfl_encoding mbfl_encoding_ucs4le = {
mbfl_encoding_ucs4le_aliases,
NULL,
MBFL_ENCTYPE_WCS4,
&vtbl_ucs4le_wchar,
&vtbl_wchar_ucs4le,
NULL,
NULL,
mb_ucs4le_to_wchar,
mb_wchar_to_ucs4le,
NULL,
NULL,
};
/*
 * Conversion vtables for the byte-at-a-time UCS-4 filters.
 * Each initializer lists, in order: source encoding id, destination encoding
 * id, mbfl_filt_conv_common_ctor, NULL, the per-value filter function, the
 * flush function, NULL.
 * NOTE(review): struct mbfl_convert_vtbl is declared elsewhere; its member
 * names are not visible here -- confirm slot meanings against that header.
 */

/* UCS-4 => wchar (uses the filter that watches for a byte order mark) */
const struct mbfl_convert_vtbl vtbl_ucs4_wchar = {
mbfl_no_encoding_ucs4,
mbfl_no_encoding_wchar,
mbfl_filt_conv_common_ctor,
NULL,
mbfl_filt_conv_ucs4_wchar,
mbfl_filt_conv_ucs4_wchar_flush,
NULL,
};
/* wchar => UCS-4 (reuses the big-endian output filter) */
const struct mbfl_convert_vtbl vtbl_wchar_ucs4 = {
mbfl_no_encoding_wchar,
mbfl_no_encoding_ucs4,
mbfl_filt_conv_common_ctor,
NULL,
mbfl_filt_conv_wchar_ucs4be,
mbfl_filt_conv_common_flush,
NULL,
};
/* UCS-4BE => wchar */
const struct mbfl_convert_vtbl vtbl_ucs4be_wchar = {
mbfl_no_encoding_ucs4be,
mbfl_no_encoding_wchar,
mbfl_filt_conv_common_ctor,
NULL,
mbfl_filt_conv_ucs4be_wchar,
mbfl_filt_conv_ucs4_wchar_flush,
NULL,
};
/* wchar => UCS-4BE */
const struct mbfl_convert_vtbl vtbl_wchar_ucs4be = {
mbfl_no_encoding_wchar,
mbfl_no_encoding_ucs4be,
mbfl_filt_conv_common_ctor,
NULL,
mbfl_filt_conv_wchar_ucs4be,
mbfl_filt_conv_common_flush,
NULL,
};
/* UCS-4LE => wchar */
const struct mbfl_convert_vtbl vtbl_ucs4le_wchar = {
mbfl_no_encoding_ucs4le,
mbfl_no_encoding_wchar,
mbfl_filt_conv_common_ctor,
NULL,
mbfl_filt_conv_ucs4le_wchar,
mbfl_filt_conv_ucs4_wchar_flush,
NULL,
};
/* wchar => UCS-4LE */
const struct mbfl_convert_vtbl vtbl_wchar_ucs4le = {
mbfl_no_encoding_wchar,
mbfl_no_encoding_ucs4le,
mbfl_filt_conv_common_ctor,
NULL,
mbfl_filt_conv_wchar_ucs4le,
mbfl_filt_conv_common_flush,
NULL,
};
/* Evaluate `statement`; if its result is negative, return -1 from the enclosing function. */
#define CK(statement) do { if ((statement) < 0) return (-1); } while (0)
/*
 * UCS-4 => wchar
 *
 * Consumes one input byte per call and emits a value after every fourth byte.
 * filter->status packs two things:
 *   - low 8 bits:  how many bytes of the current 4-byte unit have been seen (0..3)
 *   - bits 8..15:  byte-order flag; zero = big-endian, non-zero = little-endian
 * filter->cache accumulates the partially assembled unit.
 *
 * When a complete unit equals 0xFFFE0000 (a byte order mark read in the wrong
 * byte order) the byte-order flag is flipped and nothing is emitted. A unit
 * equal to 0xFEFF is swallowed. Any other unit is emitted as-is; no range
 * check is applied to it here.
 *
 * Returns 0, or -1 (via CK) if the output callback reports an error.
 */
int mbfl_filt_conv_ucs4_wchar(int c, mbfl_convert_filter *filter)
{
int n, endian;
/* Non-zero when the stream is currently being read as little-endian */
endian = filter->status & 0xff00;
switch (filter->status & 0xff) {
case 0:
/* 1st byte: least significant if little-endian, most significant otherwise */
if (endian) {
n = c & 0xff;
} else {
n = (c & 0xffu) << 24;
}
filter->cache = n;
filter->status++;
break;
case 1:
/* 2nd byte */
if (endian) {
n = (c & 0xff) << 8;
} else {
n = (c & 0xff) << 16;
}
filter->cache |= n;
filter->status++;
break;
case 2:
/* 3rd byte */
if (endian) {
n = (c & 0xff) << 16;
} else {
n = (c & 0xff) << 8;
}
filter->cache |= n;
filter->status++;
break;
default:
/* 4th byte: complete the unit */
if (endian) {
n = (c & 0xffu) << 24;
} else {
n = c & 0xff;
}
n |= filter->cache;
/* Reset the byte counter but keep the byte-order flag */
filter->status &= ~0xff;
/* Low half zero and high half 0xfffe: byte-swapped BOM, so switch byte order */
if ((n & 0xffff) == 0 && ((n >> 16) & 0xffff) == 0xfffe) {
if (endian) {
filter->status = 0; /* big-endian */
} else {
filter->status = 0x100; /* little-endian */
}
} else if (n != 0xfeff) {
CK((*filter->output_function)(n, filter->data));
}
break;
}
return 0;
}
/*
 * UCS-4BE => wchar
 *
 * One input byte per call. Bytes are accumulated most-significant first in
 * filter->cache; the fourth byte completes the unit, which is emitted without
 * further checks. filter->status counts the bytes seen so far (0..3).
 */
int mbfl_filt_conv_ucs4be_wchar(int c, mbfl_convert_filter *filter)
{
	switch (filter->status) {
	case 0:
		filter->status = 1;
		filter->cache = (int)((c & 0xffu) << 24);
		break;
	case 1:
		filter->status = 2;
		filter->cache |= (c & 0xff) << 16;
		break;
	case 2:
		filter->status = 3;
		filter->cache |= (c & 0xff) << 8;
		break;
	default: {
		filter->status = 0;
		const int unit = (c & 0xff) | filter->cache;
		CK((*filter->output_function)(unit, filter->data));
		break;
	}
	}
	return 0;
}
/*
 * wchar => UCS-4BE
 *
 * Emits the four bytes of `c`, most significant first. Only MBFL_BAD_INPUT is
 * rejected (passed to mbfl_filt_conv_illegal_output()).
 */
int mbfl_filt_conv_wchar_ucs4be(int c, mbfl_convert_filter *filter)
{
	if (c == MBFL_BAD_INPUT) {
		CK(mbfl_filt_conv_illegal_output(c, filter));
		return 0;
	}

	for (int shift = 24; shift >= 0; shift -= 8) {
		CK((*filter->output_function)((c >> shift) & 0xff, filter->data));
	}
	return 0;
}
/*
 * UCS-4LE => wchar
 *
 * One input byte per call. Bytes are accumulated least-significant first in
 * filter->cache; the fourth byte completes the unit, which is emitted without
 * further checks. filter->status counts the bytes seen so far (0..3).
 */
int mbfl_filt_conv_ucs4le_wchar(int c, mbfl_convert_filter *filter)
{
	switch (filter->status) {
	case 0:
		filter->status = 1;
		filter->cache = (c & 0xff);
		break;
	case 1:
		filter->status = 2;
		filter->cache |= (c & 0xff) << 8;
		break;
	case 2:
		filter->status = 3;
		filter->cache |= (c & 0xff) << 16;
		break;
	default: {
		filter->status = 0;
		const int unit = ((c & 0xffu) << 24) | filter->cache;
		CK((*filter->output_function)(unit, filter->data));
		break;
	}
	}
	return 0;
}
/*
 * wchar => UCS-4LE
 *
 * Emits the four bytes of `c`, least significant first. Only MBFL_BAD_INPUT
 * is rejected (passed to mbfl_filt_conv_illegal_output()).
 */
int mbfl_filt_conv_wchar_ucs4le(int c, mbfl_convert_filter *filter)
{
	if (c == MBFL_BAD_INPUT) {
		CK(mbfl_filt_conv_illegal_output(c, filter));
		return 0;
	}

	for (int shift = 0; shift <= 24; shift += 8) {
		CK((*filter->output_function)((c >> shift) & 0xff, filter->data));
	}
	return 0;
}
/*
 * Flush for the UCS-4 => wchar filters. If any of the low four status bits
 * are set, a unit was left incomplete and MBFL_BAD_INPUT is emitted. The
 * status is then cleared and the downstream flush callback, if set, is
 * invoked; its return value is not inspected.
 */
static int mbfl_filt_conv_ucs4_wchar_flush(mbfl_convert_filter *filter)
{
	const int bytes_pending = filter->status & 0xF;

	if (bytes_pending) {
		/* Input string was truncated */
		CK((*filter->output_function)(MBFL_BAD_INPUT, filter->data));
	}
	filter->status = 0;

	if (filter->flush_function) {
		(*filter->flush_function)(filter->data);
	}
	return 0;
}
#define DETECTED_BE 1
#define DETECTED_LE 2

View File

@@ -33,17 +33,5 @@
extern const mbfl_encoding mbfl_encoding_ucs4;
extern const mbfl_encoding mbfl_encoding_ucs4le;
extern const mbfl_encoding mbfl_encoding_ucs4be;
extern const struct mbfl_convert_vtbl vtbl_ucs4_wchar;
extern const struct mbfl_convert_vtbl vtbl_wchar_ucs4;
extern const struct mbfl_convert_vtbl vtbl_ucs4be_wchar;
extern const struct mbfl_convert_vtbl vtbl_wchar_ucs4be;
extern const struct mbfl_convert_vtbl vtbl_ucs4le_wchar;
extern const struct mbfl_convert_vtbl vtbl_wchar_ucs4le;
int mbfl_filt_conv_ucs4_wchar(int c, mbfl_convert_filter *filter);
int mbfl_filt_conv_ucs4be_wchar(int c, mbfl_convert_filter *filter);
int mbfl_filt_conv_wchar_ucs4be(int c, mbfl_convert_filter *filter);
int mbfl_filt_conv_ucs4le_wchar(int c, mbfl_convert_filter *filter);
int mbfl_filt_conv_wchar_ucs4le(int c, mbfl_convert_filter *filter);
#endif /* MBFL_MBFILTER_UCS4_H */

View File

@@ -173,7 +173,6 @@ static void mb_wchar_to_utf16le_default(uint32_t *in, size_t len, mb_convert_buf
#endif
static int mbfl_filt_conv_utf16_wchar_flush(mbfl_convert_filter *filter);
static size_t mb_utf16_to_wchar(unsigned char **in, size_t *in_len, uint32_t *buf, size_t bufsize, unsigned int *state);
static zend_string* mb_cut_utf16(unsigned char *str, size_t from, size_t len, unsigned char *end);
static zend_string* mb_cut_utf16be(unsigned char *str, size_t from, size_t len, unsigned char *end);
@@ -188,8 +187,8 @@ const mbfl_encoding mbfl_encoding_utf16 = {
mbfl_encoding_utf16_aliases,
NULL,
0,
&vtbl_utf16_wchar,
&vtbl_wchar_utf16,
NULL,
NULL,
mb_utf16_to_wchar,
mb_wchar_to_utf16be,
NULL,
@@ -203,8 +202,8 @@ const mbfl_encoding mbfl_encoding_utf16be = {
NULL,
NULL,
0,
&vtbl_utf16be_wchar,
&vtbl_wchar_utf16be,
NULL,
NULL,
mb_utf16be_to_wchar,
mb_wchar_to_utf16be,
NULL,
@@ -218,270 +217,14 @@ const mbfl_encoding mbfl_encoding_utf16le = {
NULL,
NULL,
0,
&vtbl_utf16le_wchar,
&vtbl_wchar_utf16le,
NULL,
NULL,
mb_utf16le_to_wchar,
mb_wchar_to_utf16le,
NULL,
mb_cut_utf16le
};
/*
 * Conversion vtables for the byte-at-a-time UTF-16 filters.
 * Each initializer lists, in order: source encoding id, destination encoding
 * id, mbfl_filt_conv_common_ctor, NULL, the per-value filter function, the
 * flush function, NULL.
 * NOTE(review): struct mbfl_convert_vtbl is declared elsewhere; its member
 * names are not visible here -- confirm slot meanings against that header.
 */

/* UTF-16 => wchar (uses the filter that inspects the first unit for a BOM) */
const struct mbfl_convert_vtbl vtbl_utf16_wchar = {
mbfl_no_encoding_utf16,
mbfl_no_encoding_wchar,
mbfl_filt_conv_common_ctor,
NULL,
mbfl_filt_conv_utf16_wchar,
mbfl_filt_conv_utf16_wchar_flush,
NULL,
};
/* wchar => UTF-16 (reuses the big-endian output filter) */
const struct mbfl_convert_vtbl vtbl_wchar_utf16 = {
mbfl_no_encoding_wchar,
mbfl_no_encoding_utf16,
mbfl_filt_conv_common_ctor,
NULL,
mbfl_filt_conv_wchar_utf16be,
mbfl_filt_conv_common_flush,
NULL,
};
/* UTF-16BE => wchar */
const struct mbfl_convert_vtbl vtbl_utf16be_wchar = {
mbfl_no_encoding_utf16be,
mbfl_no_encoding_wchar,
mbfl_filt_conv_common_ctor,
NULL,
mbfl_filt_conv_utf16be_wchar,
mbfl_filt_conv_utf16_wchar_flush,
NULL,
};
/* wchar => UTF-16BE */
const struct mbfl_convert_vtbl vtbl_wchar_utf16be = {
mbfl_no_encoding_wchar,
mbfl_no_encoding_utf16be,
mbfl_filt_conv_common_ctor,
NULL,
mbfl_filt_conv_wchar_utf16be,
mbfl_filt_conv_common_flush,
NULL,
};
/* UTF-16LE => wchar */
const struct mbfl_convert_vtbl vtbl_utf16le_wchar = {
mbfl_no_encoding_utf16le,
mbfl_no_encoding_wchar,
mbfl_filt_conv_common_ctor,
NULL,
mbfl_filt_conv_utf16le_wchar,
mbfl_filt_conv_utf16_wchar_flush,
NULL,
};
/* wchar => UTF-16LE */
const struct mbfl_convert_vtbl vtbl_wchar_utf16le = {
mbfl_no_encoding_wchar,
mbfl_no_encoding_utf16le,
mbfl_filt_conv_common_ctor,
NULL,
mbfl_filt_conv_wchar_utf16le,
mbfl_filt_conv_common_flush,
NULL,
};
/* Evaluate `statement`; if its result is negative, return -1 from the enclosing function. */
#define CK(statement) do { if ((statement) < 0) return (-1); } while (0)
/*
 * UTF-16 (byte order not yet known) => wchar, one input byte per call.
 *
 * The first two bytes are read as a big-endian 16-bit unit:
 *  - 0xFFFE: byte-swapped BOM; later bytes go to the little-endian filter.
 *  - otherwise later bytes go to the big-endian filter, and the unit is
 *    handled here: a high surrogate (0xD800..0xDBFF) is stashed with status 2
 *    so the big-endian filter can complete the pair; a lone low surrogate
 *    (0xDC00..0xDFFF) is reported as MBFL_BAD_INPUT; 0xFEFF (BOM) is dropped;
 *    any other unit is emitted.
 */
int mbfl_filt_conv_utf16_wchar(int c, mbfl_convert_filter *filter)
{
	if (filter->status == 0) {
		/* First byte: hold on to it until the second one arrives */
		filter->cache = c & 0xFF;
		filter->status = 1;
		return 0;
	}

	const int unit = (filter->cache << 8) | (c & 0xFF);
	filter->status = 0;
	filter->cache = 0;

	if (unit == 0xFFFE) {
		/* Switch to little-endian mode */
		filter->filter_function = mbfl_filt_conv_utf16le_wchar;
		return 0;
	}

	/* No little-endian BOM: stay with the big-endian assumption */
	filter->filter_function = mbfl_filt_conv_utf16be_wchar;

	if (unit >= 0xD800 && unit <= 0xDBFF) {
		filter->cache = unit & 0x3FF; /* Pick out 10 data bits */
		filter->status = 2;
	} else if (unit >= 0xDC00 && unit <= 0xDFFF) {
		/* This is wrong; second part of surrogate pair has come first */
		CK((*filter->output_function)(MBFL_BAD_INPUT, filter->data));
	} else if (unit != 0xFEFF) {
		CK((*filter->output_function)(unit, filter->data));
	}
	return 0;
}
/*
 * UTF-16BE => wchar, one input byte per call.
 *
 * filter->status tracks the position within a (possibly surrogate-paired)
 * character:
 *   0 - expecting the first byte of a 16-bit unit
 *   1 - expecting the second byte of that unit
 *   2 - a high surrogate was seen; expecting the first byte of the next unit
 *   3 - expecting the second byte of the unit following a high surrogate
 * filter->cache holds the pending byte and/or the 10 data bits of a pending
 * high surrogate.
 *
 * Malformed sequences are reported by emitting MBFL_BAD_INPUT.
 * Returns 0, or -1 (via CK) if the output callback reports an error.
 */
int mbfl_filt_conv_utf16be_wchar(int c, mbfl_convert_filter *filter)
{
int n;
switch (filter->status) {
case 0: /* First byte */
filter->cache = c & 0xFF;
filter->status = 1;
break;
case 1: /* Second byte */
n = (filter->cache << 8) | (c & 0xFF);
if (n >= 0xD800 && n <= 0xDBFF) {
filter->cache = n & 0x3FF; /* Pick out 10 data bits */
filter->status = 2;
} else if (n >= 0xDC00 && n <= 0xDFFF) {
/* This is wrong; second part of surrogate pair has come first */
filter->status = 0;
CK((*filter->output_function)(MBFL_BAD_INPUT, filter->data));
} else {
filter->status = 0;
CK((*filter->output_function)(n, filter->data));
}
break;
case 2: /* Second part of surrogate, first byte */
/* The 10 high-surrogate bits move up to bits 8..17; the new byte sits in bits 0..7 */
filter->cache = (filter->cache << 8) | (c & 0xFF);
filter->status = 3;
break;
case 3: /* Second part of surrogate, second byte */
/* Rebuild the second 16-bit unit from the byte saved in case 2 plus this byte */
n = ((filter->cache & 0xFF) << 8) | (c & 0xFF);
if (n >= 0xD800 && n <= 0xDBFF) {
/* Wrong; that's the first half of a surrogate pair, not the second */
/* Report the orphaned first half, then start over with this new high surrogate */
filter->cache = n & 0x3FF;
filter->status = 2;
CK((*filter->output_function)(MBFL_BAD_INPUT, filter->data));
} else if (n >= 0xDC00 && n <= 0xDFFF) {
filter->status = 0;
/* Bits 8..17 of cache (high-surrogate data) shift to bits 10..19, joined with the low 10 bits */
n = ((filter->cache & 0x3FF00) << 2) + (n & 0x3FF) + 0x10000;
CK((*filter->output_function)(n, filter->data));
} else {
/* High surrogate followed by a non-surrogate unit: flag the error, then emit that unit */
filter->status = 0;
CK((*filter->output_function)(MBFL_BAD_INPUT, filter->data));
CK((*filter->output_function)(n, filter->data));
}
}
return 0;
}
/*
 * wchar => UTF-16BE.
 *
 * Values in [0, MBFL_WCSPLANE_UCS2MAX) are emitted as one 16-bit unit, high
 * byte first. Values in [MBFL_WCSPLANE_SUPMIN, MBFL_WCSPLANE_SUPMAX) are
 * emitted as a surrogate pair (0xD800-based unit, then 0xDC00-based unit),
 * each high byte first. Anything else goes to mbfl_filt_conv_illegal_output().
 */
int mbfl_filt_conv_wchar_utf16be(int c, mbfl_convert_filter *filter)
{
	if (c >= 0 && c < MBFL_WCSPLANE_UCS2MAX) {
		CK((*filter->output_function)((c >> 8) & 0xff, filter->data));
		CK((*filter->output_function)(c & 0xff, filter->data));
		return 0;
	}

	if (c >= MBFL_WCSPLANE_SUPMIN && c < MBFL_WCSPLANE_SUPMAX) {
		const int lead = ((c >> 10) - 0x40) | 0xd800;
		const int trail = (c & 0x3ff) | 0xdc00;

		CK((*filter->output_function)((lead >> 8) & 0xff, filter->data));
		CK((*filter->output_function)(lead & 0xff, filter->data));
		CK((*filter->output_function)((trail >> 8) & 0xff, filter->data));
		CK((*filter->output_function)(trail & 0xff, filter->data));
		return 0;
	}

	CK(mbfl_filt_conv_illegal_output(c, filter));
	return 0;
}
/*
 * UTF-16LE => wchar, one input byte per call.
 *
 * filter->status tracks the position within a (possibly surrogate-paired)
 * character:
 *   0 - expecting the low byte of a 16-bit unit
 *   1 - expecting the high byte of that unit
 *   2 - a high surrogate was seen; expecting the low byte of the next unit
 *   3 - expecting the high byte of the unit following a high surrogate
 * filter->cache holds the pending low byte and/or the 10 data bits of a
 * pending high surrogate.
 *
 * Malformed sequences are reported by emitting MBFL_BAD_INPUT.
 * Returns 0, or -1 (via CK) if the output callback reports an error.
 */
int mbfl_filt_conv_utf16le_wchar(int c, mbfl_convert_filter *filter)
{
int n;
switch (filter->status) {
case 0:
/* Low byte of the unit */
filter->cache = c & 0xff;
filter->status = 1;
break;
case 1:
/* High byte of the unit: its top six bits tell surrogates apart */
if ((c & 0xfc) == 0xd8) {
/* Looks like we have a surrogate pair here */
/* Keep the 10 data bits: 8 from the cached low byte plus the low 2 bits of this byte */
filter->cache += ((c & 0x3) << 8);
filter->status = 2;
} else if ((c & 0xfc) == 0xdc) {
/* This is wrong; the second part of the surrogate pair has come first */
filter->status = 0;
CK((*filter->output_function)(MBFL_BAD_INPUT, filter->data));
} else {
filter->status = 0;
CK((*filter->output_function)(filter->cache + ((c & 0xff) << 8), filter->data));
}
break;
case 2:
/* Move the high-surrogate data bits up by 10 and park the next unit's low byte in bits 0..7 */
filter->cache = (filter->cache << 10) + (c & 0xff);
filter->status = 3;
break;
case 3:
/* Rebuild the second 16-bit unit from the low byte saved in case 2 plus this high byte */
n = (filter->cache & 0xFF) | ((c & 0xFF) << 8);
if (n >= 0xD800 && n <= 0xDBFF) {
/* We previously saw the first part of a surrogate pair and were
* expecting the second part; this is another first part */
filter->cache = n & 0x3FF;
filter->status = 2;
CK((*filter->output_function)(MBFL_BAD_INPUT, filter->data));
} else if (n >= 0xDC00 && n <= 0xDFFF) {
/* cache already holds (high bits << 10) + low byte; add the remaining 2 data bits and the 0x10000 offset */
n = filter->cache + ((c & 0x3) << 8) + 0x10000;
filter->status = 0;
CK((*filter->output_function)(n, filter->data));
} else {
/* The first part of a surrogate pair was followed by some other codepoint
* which is not part of a surrogate pair at all */
filter->status = 0;
CK((*filter->output_function)(MBFL_BAD_INPUT, filter->data));
CK((*filter->output_function)(n, filter->data));
}
break;
}
return 0;
}
/*
 * wchar => UTF-16LE.
 *
 * Values in [0, MBFL_WCSPLANE_UCS2MAX) are emitted as one 16-bit unit, low
 * byte first. Values in [MBFL_WCSPLANE_SUPMIN, MBFL_WCSPLANE_SUPMAX) are
 * emitted as a surrogate pair (0xD800-based unit, then 0xDC00-based unit),
 * each low byte first. Anything else goes to mbfl_filt_conv_illegal_output().
 */
int mbfl_filt_conv_wchar_utf16le(int c, mbfl_convert_filter *filter)
{
	if (c >= 0 && c < MBFL_WCSPLANE_UCS2MAX) {
		CK((*filter->output_function)(c & 0xff, filter->data));
		CK((*filter->output_function)((c >> 8) & 0xff, filter->data));
		return 0;
	}

	if (c >= MBFL_WCSPLANE_SUPMIN && c < MBFL_WCSPLANE_SUPMAX) {
		const int lead = ((c >> 10) - 0x40) | 0xd800;
		const int trail = (c & 0x3ff) | 0xdc00;

		CK((*filter->output_function)(lead & 0xff, filter->data));
		CK((*filter->output_function)((lead >> 8) & 0xff, filter->data));
		CK((*filter->output_function)(trail & 0xff, filter->data));
		CK((*filter->output_function)((trail >> 8) & 0xff, filter->data));
		return 0;
	}

	CK(mbfl_filt_conv_illegal_output(c, filter));
	return 0;
}
/*
 * Flush for the UTF-16 => wchar filters. A non-zero status means the input
 * ended in the middle of a unit or surrogate pair, so MBFL_BAD_INPUT is
 * emitted and the state is reset. The downstream flush callback, if set, is
 * then invoked; its return value is not inspected.
 */
static int mbfl_filt_conv_utf16_wchar_flush(mbfl_convert_filter *filter)
{
	const int pending = filter->status;

	if (pending) {
		/* Input string was truncated */
		filter->status = 0;
		CK((*filter->output_function)(MBFL_BAD_INPUT, filter->data));
	}

	if (filter->flush_function) {
		(*filter->flush_function)(filter->data);
	}
	return 0;
}
#define DETECTED_BE 1
#define DETECTED_LE 2

View File

@@ -34,19 +34,6 @@ extern const mbfl_encoding mbfl_encoding_utf16;
extern const mbfl_encoding mbfl_encoding_utf16be;
extern const mbfl_encoding mbfl_encoding_utf16le;
extern const struct mbfl_convert_vtbl vtbl_utf16_wchar;
extern const struct mbfl_convert_vtbl vtbl_wchar_utf16;
extern const struct mbfl_convert_vtbl vtbl_utf16be_wchar;
extern const struct mbfl_convert_vtbl vtbl_wchar_utf16be;
extern const struct mbfl_convert_vtbl vtbl_utf16le_wchar;
extern const struct mbfl_convert_vtbl vtbl_wchar_utf16le;
int mbfl_filt_conv_utf16_wchar(int c, mbfl_convert_filter *filter);
int mbfl_filt_conv_utf16be_wchar(int c, mbfl_convert_filter *filter);
int mbfl_filt_conv_wchar_utf16be(int c, mbfl_convert_filter *filter);
int mbfl_filt_conv_utf16le_wchar(int c, mbfl_convert_filter *filter);
int mbfl_filt_conv_wchar_utf16le(int c, mbfl_convert_filter *filter);
#ifdef ZEND_INTRIN_AVX2_FUNC_PTR
void init_convert_utf16(void);
#endif

View File

@@ -30,7 +30,6 @@
#include "mbfilter.h"
#include "mbfilter_utf32.h"
static int mbfl_filt_conv_utf32_wchar_flush(mbfl_convert_filter *filter);
static size_t mb_utf32_to_wchar(unsigned char **in, size_t *in_len, uint32_t *buf, size_t bufsize, unsigned int *state);
static size_t mb_utf32be_to_wchar(unsigned char **in, size_t *in_len, uint32_t *buf, size_t bufsize, unsigned int *state);
static void mb_wchar_to_utf32be(uint32_t *in, size_t len, mb_convert_buf *buf, bool end);
@@ -46,8 +45,8 @@ const mbfl_encoding mbfl_encoding_utf32 = {
mbfl_encoding_utf32_aliases,
NULL,
MBFL_ENCTYPE_WCS4,
&vtbl_utf32_wchar,
&vtbl_wchar_utf32,
NULL,
NULL,
mb_utf32_to_wchar,
mb_wchar_to_utf32be,
NULL,
@@ -61,8 +60,8 @@ const mbfl_encoding mbfl_encoding_utf32be = {
NULL,
NULL,
MBFL_ENCTYPE_WCS4,
&vtbl_utf32be_wchar,
&vtbl_wchar_utf32be,
NULL,
NULL,
mb_utf32be_to_wchar,
mb_wchar_to_utf32be,
NULL,
@@ -76,178 +75,14 @@ const mbfl_encoding mbfl_encoding_utf32le = {
NULL,
NULL,
MBFL_ENCTYPE_WCS4,
&vtbl_utf32le_wchar,
&vtbl_wchar_utf32le,
NULL,
NULL,
mb_utf32le_to_wchar,
mb_wchar_to_utf32le,
NULL,
NULL,
};
/*
 * Conversion vtables for the byte-at-a-time UTF-32 filters.
 * Each initializer lists, in order: source encoding id, destination encoding
 * id, mbfl_filt_conv_common_ctor, NULL, the per-value filter function, the
 * flush function, NULL.
 * NOTE(review): struct mbfl_convert_vtbl is declared elsewhere; its member
 * names are not visible here -- confirm slot meanings against that header.
 */

/* UTF-32 => wchar (uses the filter that inspects the first unit for a BOM) */
const struct mbfl_convert_vtbl vtbl_utf32_wchar = {
mbfl_no_encoding_utf32,
mbfl_no_encoding_wchar,
mbfl_filt_conv_common_ctor,
NULL,
mbfl_filt_conv_utf32_wchar,
mbfl_filt_conv_utf32_wchar_flush,
NULL,
};
/* wchar => UTF-32 (reuses the big-endian output filter) */
const struct mbfl_convert_vtbl vtbl_wchar_utf32 = {
mbfl_no_encoding_wchar,
mbfl_no_encoding_utf32,
mbfl_filt_conv_common_ctor,
NULL,
mbfl_filt_conv_wchar_utf32be,
mbfl_filt_conv_common_flush,
NULL,
};
/* UTF-32BE => wchar */
const struct mbfl_convert_vtbl vtbl_utf32be_wchar = {
mbfl_no_encoding_utf32be,
mbfl_no_encoding_wchar,
mbfl_filt_conv_common_ctor,
NULL,
mbfl_filt_conv_utf32be_wchar,
mbfl_filt_conv_utf32_wchar_flush,
NULL,
};
/* wchar => UTF-32BE */
const struct mbfl_convert_vtbl vtbl_wchar_utf32be = {
mbfl_no_encoding_wchar,
mbfl_no_encoding_utf32be,
mbfl_filt_conv_common_ctor,
NULL,
mbfl_filt_conv_wchar_utf32be,
mbfl_filt_conv_common_flush,
NULL,
};
/* UTF-32LE => wchar */
const struct mbfl_convert_vtbl vtbl_utf32le_wchar = {
mbfl_no_encoding_utf32le,
mbfl_no_encoding_wchar,
mbfl_filt_conv_common_ctor,
NULL,
mbfl_filt_conv_utf32le_wchar,
mbfl_filt_conv_utf32_wchar_flush,
NULL,
};
/* wchar => UTF-32LE */
const struct mbfl_convert_vtbl vtbl_wchar_utf32le = {
mbfl_no_encoding_wchar,
mbfl_no_encoding_utf32le,
mbfl_filt_conv_common_ctor,
NULL,
mbfl_filt_conv_wchar_utf32le,
mbfl_filt_conv_common_flush,
NULL,
};
/* Evaluate `statement`; if its result is negative, return -1 from the enclosing function. */
#define CK(statement) do { if ((statement) < 0) return (-1); } while (0)
static int emit_char_if_valid(int n, mbfl_convert_filter *filter)
{
if (n >= 0 && n < MBFL_WCSPLANE_UTF32MAX && (n < 0xD800 || n > 0xDFFF)) {
CK((*filter->output_function)(n, filter->data));
} else {
CK((*filter->output_function)(MBFL_BAD_INPUT, filter->data));
}
return 0;
}
/*
 * Per-byte decoder for UTF-32 whose byte order has not been determined yet.
 *
 * The first four bytes are assembled most-significant-first. If they form 0xFFFE0000
 * (a little-endian byte order mark) the filter switches itself to the UTF-32LE decoder;
 * in every other case it switches to the UTF-32BE decoder, and the assembled value is
 * emitted unless it is the big-endian byte order mark 0xFEFF.
 *
 * Returns 0 on success, -1 if emitting a character failed.
 */
int mbfl_filt_conv_utf32_wchar(int c, mbfl_convert_filter *filter)
{
	int first;

	if (filter->status < 3) {
		/* Fewer than four bytes seen so far: keep accumulating */
		filter->cache = (filter->cache << 8) | (c & 0xFF);
		filter->status++;
		return 0;
	}

	first = ((unsigned int)filter->cache << 8) | (c & 0xFF);
	filter->status = 0;
	filter->cache = 0;

	if (first == 0xFFFE0000) {
		/* Found a little-endian byte order mark; it is consumed, not emitted */
		filter->filter_function = mbfl_filt_conv_utf32le_wchar;
		return 0;
	}

	/* No little-endian mark: the rest of the input is handled as big-endian */
	filter->filter_function = mbfl_filt_conv_utf32be_wchar;
	if (first != 0xFEFF) {
		/* Not a big-endian byte order mark either, so it is ordinary data */
		CK(emit_char_if_valid(first, filter));
	}
	return 0;
}
/*
 * Per-byte decoder for big-endian UTF-32.
 *
 * Three bytes are buffered in filter->cache (filter->status counts them); the fourth
 * byte completes the value, which is handed to emit_char_if_valid() after the filter
 * state has been cleared.
 *
 * Returns 0 on success, -1 if emitting the character failed.
 */
int mbfl_filt_conv_utf32be_wchar(int c, mbfl_convert_filter *filter)
{
	int codepoint;

	if (filter->status < 3) {
		filter->cache = (filter->cache << 8) | (c & 0xFF);
		filter->status++;
		return 0;
	}

	codepoint = ((unsigned int)filter->cache << 8) | (c & 0xFF);
	filter->status = 0;
	filter->cache = 0;
	CK(emit_char_if_valid(codepoint, filter));
	return 0;
}
/*
 * Encode one code point as four big-endian UTF-32 bytes.
 *
 * Values that are negative or not below MBFL_WCSPLANE_UTF32MAX are passed to
 * mbfl_filt_conv_illegal_output() instead of being encoded.
 *
 * Returns 0 on success, -1 if any output call failed.
 */
int mbfl_filt_conv_wchar_utf32be(int c, mbfl_convert_filter *filter)
{
	int shift;

	if (c < 0 || c >= MBFL_WCSPLANE_UTF32MAX) {
		CK(mbfl_filt_conv_illegal_output(c, filter));
		return 0;
	}

	/* Most significant byte first */
	for (shift = 24; shift >= 0; shift -= 8) {
		CK((*filter->output_function)((c >> shift) & 0xff, filter->data));
	}
	return 0;
}
/*
 * Per-byte decoder for little-endian UTF-32.
 *
 * The first three bytes are OR-ed into filter->cache at bit offsets 0, 8 and 16
 * (filter->status selects the offset); the fourth byte supplies bits 24-31, after
 * which the filter state is cleared and the value goes to emit_char_if_valid().
 *
 * Returns 0 on success, -1 if emitting the character failed.
 */
int mbfl_filt_conv_utf32le_wchar(int c, mbfl_convert_filter *filter)
{
	int codepoint;

	if (filter->status < 3) {
		filter->cache |= ((c & 0xFFU) << (8 * filter->status));
		filter->status++;
		return 0;
	}

	codepoint = ((c & 0xFFU) << 24) | filter->cache;
	filter->status = 0;
	filter->cache = 0;
	CK(emit_char_if_valid(codepoint, filter));
	return 0;
}
/*
 * Encode one code point as four little-endian UTF-32 bytes.
 *
 * Values that are negative or not below MBFL_WCSPLANE_UTF32MAX are passed to
 * mbfl_filt_conv_illegal_output() instead of being encoded.
 *
 * Returns 0 on success, -1 if any output call failed.
 */
int mbfl_filt_conv_wchar_utf32le(int c, mbfl_convert_filter *filter)
{
	int shift;

	if (c < 0 || c >= MBFL_WCSPLANE_UTF32MAX) {
		CK(mbfl_filt_conv_illegal_output(c, filter));
		return 0;
	}

	/* Least significant byte first */
	for (shift = 0; shift <= 24; shift += 8) {
		CK((*filter->output_function)((c >> shift) & 0xff, filter->data));
	}
	return 0;
}
/*
 * End-of-input handler shared by the UTF-32 decoders.
 *
 * A non-zero filter->status means a character was only partly received; in that case
 * MBFL_BAD_INPUT is emitted for it. The filter state is then cleared and the
 * downstream flush function, if one is set, is invoked.
 *
 * Returns 0 on success, -1 if emitting MBFL_BAD_INPUT failed (state is then left
 * untouched and the downstream flush is not invoked).
 */
static int mbfl_filt_conv_utf32_wchar_flush(mbfl_convert_filter *filter)
{
	if (filter->status != 0) {
		/* Input string was truncated in the middle of a 4-byte unit */
		if ((*filter->output_function)(MBFL_BAD_INPUT, filter->data) < 0) {
			return -1;
		}
	}

	filter->status = 0;
	filter->cache = 0;

	if (filter->flush_function) {
		(*filter->flush_function)(filter->data);
	}
	return 0;
}
/* Byte-order markers (big- vs little-endian). Their users are outside this view;
 * presumably they are stored as decoder state once the byte order is known — TODO confirm */
#define DETECTED_BE 1
#define DETECTED_LE 2

View File

@@ -34,17 +34,4 @@ extern const mbfl_encoding mbfl_encoding_utf32;
extern const mbfl_encoding mbfl_encoding_utf32be;
extern const mbfl_encoding mbfl_encoding_utf32le;
extern const struct mbfl_convert_vtbl vtbl_utf32_wchar;
extern const struct mbfl_convert_vtbl vtbl_wchar_utf32;
extern const struct mbfl_convert_vtbl vtbl_utf32be_wchar;
extern const struct mbfl_convert_vtbl vtbl_wchar_utf32be;
extern const struct mbfl_convert_vtbl vtbl_utf32le_wchar;
extern const struct mbfl_convert_vtbl vtbl_wchar_utf32le;
int mbfl_filt_conv_utf32_wchar(int c, mbfl_convert_filter *filter);
int mbfl_filt_conv_utf32be_wchar(int c, mbfl_convert_filter *filter);
int mbfl_filt_conv_wchar_utf32be(int c, mbfl_convert_filter *filter);
int mbfl_filt_conv_utf32le_wchar(int c, mbfl_convert_filter *filter);
int mbfl_filt_conv_wchar_utf32le(int c, mbfl_convert_filter *filter);
#endif /* MBFL_MBFILTER_UTF32_H */

View File

@@ -52,14 +52,6 @@ const unsigned char mblen_table_utf8[] = {
};
extern int mbfl_bisec_srch2(int w, const unsigned short tbl[], int n);
extern int mbfl_filt_conv_sjis_mobile_flush(mbfl_convert_filter *filter);
static int mbfl_filt_conv_utf8_mobile_wchar(int c, mbfl_convert_filter *filter);
static int mbfl_filt_conv_wchar_utf8_mobile(int c, mbfl_convert_filter *filter);
static int mbfl_filt_conv_utf8_wchar(int c, mbfl_convert_filter *filter);
static int mbfl_filt_conv_wchar_utf8(int c, mbfl_convert_filter *filter);
static int mbfl_filt_conv_utf8_wchar_flush(mbfl_convert_filter *filter);
static size_t mb_utf8_to_wchar(unsigned char **in, size_t *in_len, uint32_t *buf, size_t bufsize, unsigned int *state);
static void mb_wchar_to_utf8(uint32_t *in, size_t len, mb_convert_buf *buf, bool end);
@@ -143,34 +135,14 @@ const mbfl_encoding mbfl_encoding_utf8 = {
mbfl_encoding_utf8_aliases,
mblen_table_utf8,
0,
&vtbl_utf8_wchar,
&vtbl_wchar_utf8,
NULL,
NULL,
mb_utf8_to_wchar,
mb_wchar_to_utf8,
NULL,
mb_cut_utf8
};
/*
 * Byte-at-a-time conversion tables for plain UTF-8.
 * Each table starts with two encoding ids (presumably "from" then "to"; the field names
 * of struct mbfl_convert_vtbl are not visible here), then mbfl_filt_conv_common_ctor,
 * a NULL slot, the per-byte filter function, a flush function and a trailing NULL slot.
 */

/* UTF-8 -> wchar; uses the UTF-8 specific flush function */
const struct mbfl_convert_vtbl vtbl_utf8_wchar = {
mbfl_no_encoding_utf8,
mbfl_no_encoding_wchar,
mbfl_filt_conv_common_ctor,
NULL,
mbfl_filt_conv_utf8_wchar,
mbfl_filt_conv_utf8_wchar_flush,
NULL,
};
/* wchar -> UTF-8; uses the common flush function */
const struct mbfl_convert_vtbl vtbl_wchar_utf8 = {
mbfl_no_encoding_wchar,
mbfl_no_encoding_utf8,
mbfl_filt_conv_common_ctor,
NULL,
mbfl_filt_conv_wchar_utf8,
mbfl_filt_conv_common_flush,
NULL,
};
/* NULL-terminated lists of alternative names for the DoCoMo, KDDI (variant B) and
 * SoftBank flavours of UTF-8; each is referenced from the matching mbfl_encoding_utf8_* definition */
static const char *mbfl_encoding_utf8_docomo_aliases[] = {"UTF-8-DOCOMO", "UTF8-DOCOMO", NULL};
static const char *mbfl_encoding_utf8_kddi_b_aliases[] = {"UTF-8-Mobile#KDDI", "UTF-8-KDDI", "UTF8-KDDI", NULL};
static const char *mbfl_encoding_utf8_sb_aliases[] = {"UTF-8-SOFTBANK", "UTF8-SOFTBANK", NULL};
@@ -182,8 +154,8 @@ const mbfl_encoding mbfl_encoding_utf8_docomo = {
mbfl_encoding_utf8_docomo_aliases,
mblen_table_utf8,
0,
&vtbl_utf8_docomo_wchar,
&vtbl_wchar_utf8_docomo,
NULL,
NULL,
mb_utf8_docomo_to_wchar,
mb_wchar_to_utf8_docomo,
NULL,
@@ -197,8 +169,8 @@ const mbfl_encoding mbfl_encoding_utf8_kddi_a = {
NULL,
mblen_table_utf8,
0,
&vtbl_utf8_kddi_a_wchar,
&vtbl_wchar_utf8_kddi_a,
NULL,
NULL,
mb_utf8_kddi_a_to_wchar,
mb_wchar_to_utf8_kddi_a,
NULL,
@@ -212,8 +184,8 @@ const mbfl_encoding mbfl_encoding_utf8_kddi_b = {
mbfl_encoding_utf8_kddi_b_aliases,
mblen_table_utf8,
0,
&vtbl_utf8_kddi_b_wchar,
&vtbl_wchar_utf8_kddi_b,
NULL,
NULL,
mb_utf8_kddi_b_to_wchar,
mb_wchar_to_utf8_kddi_b,
NULL,
@@ -227,222 +199,14 @@ const mbfl_encoding mbfl_encoding_utf8_sb = {
mbfl_encoding_utf8_sb_aliases,
mblen_table_utf8,
0,
&vtbl_utf8_sb_wchar,
&vtbl_wchar_utf8_sb,
NULL,
NULL,
mb_utf8_sb_to_wchar,
mb_wchar_to_utf8_sb,
NULL,
mb_cut_utf8,
};
/*
 * Byte-at-a-time conversion tables for the mobile-carrier flavours of UTF-8
 * (DoCoMo, KDDI A, KDDI B, SoftBank).
 *
 * All four decoders share mbfl_filt_conv_utf8_mobile_wchar and all four encoders share
 * mbfl_filt_conv_wchar_utf8_mobile; the tables differ only in the encoding id they
 * carry. Decoders flush with mbfl_filt_conv_utf8_wchar_flush, encoders flush with
 * mbfl_filt_conv_sjis_mobile_flush.
 */

/* UTF-8 (DoCoMo) -> wchar */
const struct mbfl_convert_vtbl vtbl_utf8_docomo_wchar = {
mbfl_no_encoding_utf8_docomo,
mbfl_no_encoding_wchar,
mbfl_filt_conv_common_ctor,
NULL,
mbfl_filt_conv_utf8_mobile_wchar,
mbfl_filt_conv_utf8_wchar_flush,
NULL,
};
/* wchar -> UTF-8 (DoCoMo) */
const struct mbfl_convert_vtbl vtbl_wchar_utf8_docomo = {
mbfl_no_encoding_wchar,
mbfl_no_encoding_utf8_docomo,
mbfl_filt_conv_common_ctor,
NULL,
mbfl_filt_conv_wchar_utf8_mobile,
mbfl_filt_conv_sjis_mobile_flush,
NULL,
};
/* UTF-8 (KDDI A) -> wchar */
const struct mbfl_convert_vtbl vtbl_utf8_kddi_a_wchar = {
mbfl_no_encoding_utf8_kddi_a,
mbfl_no_encoding_wchar,
mbfl_filt_conv_common_ctor,
NULL,
mbfl_filt_conv_utf8_mobile_wchar,
mbfl_filt_conv_utf8_wchar_flush,
NULL,
};
/* wchar -> UTF-8 (KDDI A) */
const struct mbfl_convert_vtbl vtbl_wchar_utf8_kddi_a = {
mbfl_no_encoding_wchar,
mbfl_no_encoding_utf8_kddi_a,
mbfl_filt_conv_common_ctor,
NULL,
mbfl_filt_conv_wchar_utf8_mobile,
mbfl_filt_conv_sjis_mobile_flush,
NULL,
};
/* UTF-8 (KDDI B) -> wchar */
const struct mbfl_convert_vtbl vtbl_utf8_kddi_b_wchar = {
mbfl_no_encoding_utf8_kddi_b,
mbfl_no_encoding_wchar,
mbfl_filt_conv_common_ctor,
NULL,
mbfl_filt_conv_utf8_mobile_wchar,
mbfl_filt_conv_utf8_wchar_flush,
NULL,
};
/* wchar -> UTF-8 (KDDI B) */
const struct mbfl_convert_vtbl vtbl_wchar_utf8_kddi_b = {
mbfl_no_encoding_wchar,
mbfl_no_encoding_utf8_kddi_b,
mbfl_filt_conv_common_ctor,
NULL,
mbfl_filt_conv_wchar_utf8_mobile,
mbfl_filt_conv_sjis_mobile_flush,
NULL,
};
/* UTF-8 (SoftBank) -> wchar */
const struct mbfl_convert_vtbl vtbl_utf8_sb_wchar = {
mbfl_no_encoding_utf8_sb,
mbfl_no_encoding_wchar,
mbfl_filt_conv_common_ctor,
NULL,
mbfl_filt_conv_utf8_mobile_wchar,
mbfl_filt_conv_utf8_wchar_flush,
NULL,
};
/* wchar -> UTF-8 (SoftBank) */
const struct mbfl_convert_vtbl vtbl_wchar_utf8_sb = {
mbfl_no_encoding_wchar,
mbfl_no_encoding_utf8_sb,
mbfl_filt_conv_common_ctor,
NULL,
mbfl_filt_conv_wchar_utf8_mobile,
mbfl_filt_conv_sjis_mobile_flush,
NULL,
};
/* Leave the enclosing function with -1 as soon as `statement` evaluates to a negative value */
#define CK(statement) do { if ((statement) < 0) return (-1); } while (0)

/*
 * Abandon the byte sequence currently being decoded: clear the filter's status and
 * cache, then emit MBFL_BAD_INPUT through the output function.
 *
 * Returns 0 on success, -1 if the output function returned a negative value.
 */
static int mbfl_filt_put_invalid_char(mbfl_convert_filter *filter)
{
	filter->cache = 0;
	filter->status = 0;

	if ((*filter->output_function)(MBFL_BAD_INPUT, filter->data) < 0) {
		return -1;
	}
	return 0;
}
/*
 * Per-byte UTF-8 decoder.
 *
 * filter->status encodes the position inside a multi-byte sequence: 0x00 means "expecting
 * a lead byte"; 0x10, 0x20 and 0x30 are entered after a 2-, 3- or 4-byte lead, and the
 * value is incremented for every continuation byte accepted after that (0x21, 0x31,
 * 0x32). filter->cache holds the code point bits collected so far.
 *
 * When a byte is not acceptable in the current state, mbfl_filt_put_invalid_char()
 * emits MBFL_BAD_INPUT and resets the state to 0x00; inside a sequence the offending
 * byte is then re-examined as a possible lead byte via `goto retry`.
 *
 * Returns 0 on success, -1 if an output call failed.
 */
static int mbfl_filt_conv_utf8_wchar(int c, mbfl_convert_filter *filter)
{
int s, c1;
retry:
switch (filter->status) {
case 0x00:
/* Lead byte (or ASCII). 0x80-0xc1 and 0xf5 and above are rejected by the final else */
if (c < 0x80) {
CK((*filter->output_function)(c, filter->data));
} else if (c >= 0xc2 && c <= 0xdf) { /* 2byte code first char: 0xc2-0xdf */
filter->status = 0x10;
filter->cache = c & 0x1f;
} else if (c >= 0xe0 && c <= 0xef) { /* 3byte code first char: 0xe0-0xef */
filter->status = 0x20;
filter->cache = c & 0xf;
} else if (c >= 0xf0 && c <= 0xf4) { /* 4byte code first char: 0xf0-0xf4 */
filter->status = 0x30;
filter->cache = c & 0x7;
} else {
CK(mbfl_filt_put_invalid_char(filter));
}
break;
/* Final byte of a sequence: complete the code point, reset the state, emit it */
case 0x10: /* 2byte code 2nd char: 0x80-0xbf */
case 0x21: /* 3byte code 3rd char: 0x80-0xbf */
case 0x32: /* 4byte code 4th char: 0x80-0xbf */
if (c >= 0x80 && c <= 0xbf) {
s = (filter->cache<<6) | (c & 0x3f);
filter->status = filter->cache = 0;
CK((*filter->output_function)(s, filter->data));
} else {
CK(mbfl_filt_put_invalid_char(filter));
goto retry;
}
break;
/* The allowed range depends on the low 4 bits of the lead byte (c1): this is what
 * rejects 3-byte encodings below U+0800 and the surrogate range U+D800-U+DFFF */
case 0x20: /* 3byte code 2nd char: lead E0: 0xa0-0xbf, lead ED: 0x80-0x9f, other leads: 0x80-0xbf */
s = (filter->cache<<6) | (c & 0x3f);
c1 = filter->cache & 0xf;
if ((c >= 0x80 && c <= 0xbf) &&
((c1 == 0x0 && c >= 0xa0) ||
(c1 == 0xd && c < 0xa0) ||
(c1 > 0x0 && c1 != 0xd))) {
filter->cache = s;
filter->status++;
} else {
CK(mbfl_filt_put_invalid_char(filter));
goto retry;
}
break;
/* Here c1 is the low 3 bits of the lead byte: this is what rejects 4-byte encodings
 * below U+10000 and above U+10FFFF */
case 0x30: /* 4byte code 2nd char: 0:0x90-0xbf,1-3:0x80-0xbf,4:0x80-0x8f */
s = (filter->cache<<6) | (c & 0x3f);
c1 = filter->cache & 0x7;
if ((c >= 0x80 && c <= 0xbf) &&
((c1 == 0x0 && c >= 0x90) ||
(c1 == 0x4 && c < 0x90) ||
(c1 > 0x0 && c1 != 0x4))) {
filter->cache = s;
filter->status++;
} else {
CK(mbfl_filt_put_invalid_char(filter));
goto retry;
}
break;
case 0x31: /* 4byte code 3rd char: 0x80-0xbf */
if (c >= 0x80 && c <= 0xbf) {
filter->cache = (filter->cache<<6) | (c & 0x3f);
filter->status++;
} else {
CK(mbfl_filt_put_invalid_char(filter));
goto retry;
}
break;
EMPTY_SWITCH_DEFAULT_CASE();
}
return 0;
}
/*
 * End-of-input handler for the UTF-8 decoders.
 *
 * If the input stopped in the middle of a multi-byte sequence (non-zero status),
 * MBFL_BAD_INPUT is emitted for it and the status is reset. The downstream flush
 * function, if one is set, is then invoked. Always returns 0.
 *
 * NOTE(review): unlike mbfl_filt_conv_utf32_wchar_flush, the result of the output call
 * is not checked here and filter->cache is not cleared; this is preserved as-is.
 */
static int mbfl_filt_conv_utf8_wchar_flush(mbfl_convert_filter *filter)
{
	int truncated = (filter->status != 0);

	if (truncated) {
		(*filter->output_function)(MBFL_BAD_INPUT, filter->data);
		filter->status = 0;
	}

	if (filter->flush_function) {
		(*filter->flush_function)(filter->data);
	}

	return 0;
}
/*
 * Encode one code point as UTF-8 (1 to 4 bytes), emitting each byte through the
 * filter's output function.
 *
 * Values outside 0..0x10FFFF are handed to mbfl_filt_conv_illegal_output() instead.
 *
 * Returns 0 on success, -1 if any output call failed.
 */
static int mbfl_filt_conv_wchar_utf8(int c, mbfl_convert_filter *filter)
{
	int lead;      /* first byte of a multi-byte sequence */
	int trailing;  /* number of continuation bytes still to emit */

	if (c < 0 || c >= 0x110000) {
		CK(mbfl_filt_conv_illegal_output(c, filter));
		return 0;
	}

	if (c < 0x80) {
		/* ASCII is emitted unchanged */
		CK((*filter->output_function)(c, filter->data));
		return 0;
	}

	if (c < 0x800) {
		lead = ((c >> 6) & 0x1f) | 0xc0;
		trailing = 1;
	} else if (c < 0x10000) {
		lead = ((c >> 12) & 0x0f) | 0xe0;
		trailing = 2;
	} else {
		lead = ((c >> 18) & 0x07) | 0xf0;
		trailing = 3;
	}

	CK((*filter->output_function)(lead, filter->data));

	/* Continuation bytes carry 6 bits each, highest group first */
	while (trailing-- > 0) {
		CK((*filter->output_function)(((c >> (6 * trailing)) & 0x3f) | 0x80, filter->data));
	}
	return 0;
}
static size_t mb_utf8_to_wchar(unsigned char **in, size_t *in_len, uint32_t *buf, size_t bufsize, unsigned int *state)
{
unsigned char *p = *in, *e = p + *in_len;
@@ -581,143 +345,6 @@ static zend_string* mb_cut_utf8(unsigned char *str, size_t from, size_t len, uns
return zend_string_init_fast((char*)start, _end - start);
}
/*
 * Per-byte decoder shared by the mobile-carrier flavours of UTF-8.
 *
 * The byte-level state machine mirrors the plain UTF-8 decoder: the low 8 bits of
 * filter->status are 0x00 when a lead byte is expected, 0x10/0x20/0x30 after a 2-, 3-
 * or 4-byte lead, and are incremented per accepted continuation byte; filter->cache
 * holds the bits collected so far. Invalid bytes produce MBFL_BAD_INPUT and, inside a
 * sequence, are re-examined as a lead byte via `goto retry`.
 *
 * The difference is in the final step: once a code point is complete it is looked up
 * in the table belonging to the source encoding (filter->from->no_encoding) and, on a
 * hit, replaced by the result of the matching mbfilter_sjis_emoji_*2unicode() helper.
 * If that helper sets `snd` to a positive value, `snd` is emitted before the code point.
 *
 * Returns 0 on success, -1 if an output call failed.
 */
static int mbfl_filt_conv_utf8_mobile_wchar(int c, mbfl_convert_filter *filter)
{
int s, s1 = 0, c1 = 0, snd = 0;
retry:
switch (filter->status & 0xff) {
case 0x00:
/* Lead byte (or ASCII). 0x80-0xc1 and 0xf5 and above are rejected by the final else */
if (c < 0x80) {
CK((*filter->output_function)(c, filter->data));
} else if (c >= 0xc2 && c <= 0xdf) { /* 2byte code first char: 0xc2-0xdf */
filter->status = 0x10;
filter->cache = c & 0x1f;
} else if (c >= 0xe0 && c <= 0xef) { /* 3byte code first char: 0xe0-0xef */
filter->status = 0x20;
filter->cache = c & 0xf;
} else if (c >= 0xf0 && c <= 0xf4) { /* 4byte code first char: 0xf0-0xf4 */
filter->status = 0x30;
filter->cache = c & 0x7;
} else {
CK(mbfl_filt_put_invalid_char(filter));
}
break;
/* Final byte of a sequence; the status is cleared up front for both outcomes */
case 0x10: /* 2byte code 2nd char: 0x80-0xbf */
case 0x21: /* 3byte code 3rd char: 0x80-0xbf */
case 0x32: /* 4byte code 4th char: 0x80-0xbf */
filter->status = 0;
if (c >= 0x80 && c <= 0xbf) {
s = (filter->cache << 6) | (c & 0x3f);
filter->cache = 0;
/* Carrier-specific remapping; the numeric argument is presumably the number of
 * ranges in the table that follows it — TODO confirm against mbfilter_conv_r_map_tbl */
if (filter->from->no_encoding == mbfl_no_encoding_utf8_docomo && mbfilter_conv_r_map_tbl(s, &s1, 4, mbfl_docomo2uni_pua)) {
s = mbfilter_sjis_emoji_docomo2unicode(s1, &snd);
} else if (filter->from->no_encoding == mbfl_no_encoding_utf8_kddi_a && mbfilter_conv_r_map_tbl(s, &s1, 7, mbfl_kddi2uni_pua)) {
s = mbfilter_sjis_emoji_kddi2unicode(s1, &snd);
} else if (filter->from->no_encoding == mbfl_no_encoding_utf8_kddi_b && mbfilter_conv_r_map_tbl(s, &s1, 8, mbfl_kddi2uni_pua_b)) {
s = mbfilter_sjis_emoji_kddi2unicode(s1, &snd);
} else if (filter->from->no_encoding == mbfl_no_encoding_utf8_sb && mbfilter_conv_r_map_tbl(s, &s1, 6, mbfl_sb2uni_pua)) {
s = mbfilter_sjis_emoji_sb2unicode(s1, &snd);
}
/* snd stays 0 unless one of the helpers above set it; when set it is emitted first */
if (snd > 0) {
CK((*filter->output_function)(snd, filter->data));
}
CK((*filter->output_function)(s, filter->data));
} else {
CK(mbfl_filt_put_invalid_char(filter));
goto retry;
}
break;
/* The allowed range depends on the low 4 bits of the lead byte (c1): this is what
 * rejects 3-byte encodings below U+0800 and the surrogate range U+D800-U+DFFF */
case 0x20: /* 3byte code 2nd char: lead E0: 0xa0-0xbf, lead ED: 0x80-0x9f, other leads: 0x80-0xbf */
s = (filter->cache << 6) | (c & 0x3f);
c1 = filter->cache & 0xf;
if ((c >= 0x80 && c <= 0xbf) &&
((c1 == 0x0 && c >= 0xa0) ||
(c1 == 0xd && c < 0xa0) ||
(c1 > 0x0 && c1 != 0xd))) {
filter->cache = s;
filter->status++;
} else {
CK(mbfl_filt_put_invalid_char(filter));
goto retry;
}
break;
/* Here c1 is the low 3 bits of the lead byte: this is what rejects 4-byte encodings
 * below U+10000 and above U+10FFFF */
case 0x30: /* 4byte code 2nd char: 0:0x90-0xbf,1-3:0x80-0xbf,4:0x80-0x8f */
s = (filter->cache << 6) | (c & 0x3f);
c1 = filter->cache & 0x7;
if ((c >= 0x80 && c <= 0xbf) &&
((c1 == 0x0 && c >= 0x90) ||
(c1 == 0x4 && c < 0x90) ||
(c1 > 0x0 && c1 != 0x4))) {
filter->cache = s;
filter->status++;
} else {
CK(mbfl_filt_put_invalid_char(filter));
goto retry;
}
break;
case 0x31: /* 4byte code 3rd char: 0x80-0xbf */
if (c >= 0x80 && c <= 0xbf) {
filter->cache = (filter->cache << 6) | (c & 0x3f);
filter->status++;
} else {
CK(mbfl_filt_put_invalid_char(filter));
goto retry;
}
break;
EMPTY_SWITCH_DEFAULT_CASE();
}
return 0;
}
/*
 * Encoder shared by the mobile-carrier flavours of UTF-8.
 *
 * For a code point in 0..0x10FFFF, the helper matching the target encoding
 * (filter->to->no_encoding) is asked for a carrier emoji code; if it returns a positive
 * value and the code maps through the carrier's table, the mapped value replaces `c`.
 * The (possibly replaced) code point is then written as 1 to 4 UTF-8 bytes.
 * Code points outside that range go to mbfl_filt_conv_illegal_output().
 *
 * Returns 0 on success, -1 if an output call failed.
 */
static int mbfl_filt_conv_wchar_utf8_mobile(int c, mbfl_convert_filter *filter)
{
if (c >= 0 && c < 0x110000) {
int s1, c1;
/* Short-circuit evaluation matters here: only the helper for the target encoding is
 * called, and s1/c1 are only read after the calls that write them have succeeded */
if ((filter->to->no_encoding == mbfl_no_encoding_utf8_docomo && mbfilter_unicode2sjis_emoji_docomo(c, &s1, filter) > 0 && mbfilter_conv_map_tbl(s1, &c1, 4, mbfl_docomo2uni_pua)) ||
(filter->to->no_encoding == mbfl_no_encoding_utf8_kddi_a && mbfilter_unicode2sjis_emoji_kddi_sjis(c, &s1, filter) > 0 && mbfilter_conv_map_tbl(s1, &c1, 7, mbfl_kddi2uni_pua)) ||
(filter->to->no_encoding == mbfl_no_encoding_utf8_kddi_b && mbfilter_unicode2sjis_emoji_kddi_sjis(c, &s1, filter) > 0 && mbfilter_conv_map_tbl(s1, &c1, 8, mbfl_kddi2uni_pua_b)) ||
(filter->to->no_encoding == mbfl_no_encoding_utf8_sb && mbfilter_unicode2sjis_emoji_sb(c, &s1, filter) > 0 && mbfilter_conv_map_tbl(s1, &c1, 6, mbfl_sb2uni_pua))) {
c = c1;
}
/* A non-zero status suppresses output for this call. NOTE(review): presumably the emoji
 * helper has buffered the code point while waiting for a possible follow-up code point —
 * confirm against the mbfilter_unicode2sjis_emoji_* implementations */
if (filter->status) {
return 0;
}
/* Standard UTF-8 encoding of c */
if (c < 0x80) {
CK((*filter->output_function)(c, filter->data));
} else if (c < 0x800) {
CK((*filter->output_function)(((c >> 6) & 0x1f) | 0xc0, filter->data));
CK((*filter->output_function)((c & 0x3f) | 0x80, filter->data));
} else if (c < 0x10000) {
CK((*filter->output_function)(((c >> 12) & 0x0f) | 0xe0, filter->data));
CK((*filter->output_function)(((c >> 6) & 0x3f) | 0x80, filter->data));
CK((*filter->output_function)((c & 0x3f) | 0x80, filter->data));
} else {
CK((*filter->output_function)(((c >> 18) & 0x07) | 0xf0, filter->data));
CK((*filter->output_function)(((c >> 12) & 0x3f) | 0x80, filter->data));
CK((*filter->output_function)(((c >> 6) & 0x3f) | 0x80, filter->data));
CK((*filter->output_function)((c & 0x3f) | 0x80, filter->data));
}
} else {
CK(mbfl_filt_conv_illegal_output(c, filter));
}
return 0;
}
/* Regional Indicator Unicode codepoints are from 0x1F1E6-0x1F1FF
* These correspond to the letters A-Z
* To display the flag emoji for a country, two unicode codepoints are combined,

View File

@@ -31,21 +31,9 @@
#define MBFL_MBFILTER_UTF8_H
extern const mbfl_encoding mbfl_encoding_utf8;
extern const struct mbfl_convert_vtbl vtbl_utf8_wchar;
extern const struct mbfl_convert_vtbl vtbl_wchar_utf8;
extern const mbfl_encoding mbfl_encoding_utf8_docomo;
extern const mbfl_encoding mbfl_encoding_utf8_kddi_a;
extern const mbfl_encoding mbfl_encoding_utf8_kddi_b;
extern const mbfl_encoding mbfl_encoding_utf8_sb;
extern const struct mbfl_convert_vtbl vtbl_utf8_docomo_wchar;
extern const struct mbfl_convert_vtbl vtbl_wchar_utf8_docomo;
extern const struct mbfl_convert_vtbl vtbl_utf8_kddi_a_wchar;
extern const struct mbfl_convert_vtbl vtbl_wchar_utf8_kddi_a;
extern const struct mbfl_convert_vtbl vtbl_utf8_kddi_b_wchar;
extern const struct mbfl_convert_vtbl vtbl_wchar_utf8_kddi_b;
extern const struct mbfl_convert_vtbl vtbl_utf8_sb_wchar;
extern const struct mbfl_convert_vtbl vtbl_wchar_utf8_sb;
#endif /* MBFL_MBFILTER_UTF8_H */