1
0
mirror of https://github.com/php/php-src.git synced 2026-03-24 00:02:20 +01:00

Implement fast text conversion interface for SJIS-mac

This commit is contained in:
Alex Dowad
2022-01-09 19:53:24 +02:00
parent c9479899c6
commit 6cf30356e0
2 changed files with 406 additions and 2 deletions

View File

@@ -39,6 +39,8 @@ extern const unsigned char mblen_table_sjis[];
/* Flush callbacks for the two SJIS-mac conversion filters */
static int mbfl_filt_conv_wchar_sjis_mac_flush(mbfl_convert_filter *filter);
static int mbfl_filt_conv_sjis_mac_wchar_flush(mbfl_convert_filter *filter);
/* Buffer-at-a-time conversion functions (SJIS-mac -> codepoints and
 * codepoints -> SJIS-mac); both are defined later in this file */
static size_t mb_sjismac_to_wchar(unsigned char **in, size_t *in_len, uint32_t *buf, size_t bufsize, unsigned int *state);
static void mb_wchar_to_sjismac(uint32_t *in, size_t len, mb_convert_buf *buf, bool end);
/* NULL-terminated list of alternative names for this encoding */
static const char *mbfl_encoding_sjis_mac_aliases[] = {"MacJapanese", "x-Mac-Japanese", NULL};
@@ -51,8 +53,8 @@ const mbfl_encoding mbfl_encoding_sjis_mac = {
MBFL_ENCTYPE_GL_UNSAFE,
&vtbl_sjis_mac_wchar,
&vtbl_wchar_sjis_mac,
NULL,
NULL
mb_sjismac_to_wchar,
mb_wchar_to_sjismac
};
const struct mbfl_convert_vtbl vtbl_sjis_mac_wchar = {
@@ -662,3 +664,402 @@ mbfl_filt_conv_wchar_sjis_mac_flush(mbfl_convert_filter *filter)
return 0;
}
/* Decode SJIS-mac (MacJapanese) bytes into Unicode codepoints.
 *
 * in, in_len: pointer to / number of remaining input bytes; both are updated
 *             on return to reflect how much input was consumed.
 * buf, bufsize: output array of codepoints and its capacity.
 * state: not referenced by this function.
 *
 * Returns the number of codepoints written to buf.
 *
 * One double-byte character may expand to several codepoints. When the
 * remaining output space is too small for such an expansion, the input
 * pointer is moved back to the start of that character and the function
 * returns, so the same character is seen again on the next call. */
static size_t mb_sjismac_to_wchar(unsigned char **in, size_t *in_len, uint32_t *buf, size_t bufsize, unsigned int *state)
{
unsigned char *p = *in, *e = p + *in_len;
uint32_t *out = buf, *limit = buf + bufsize;
while (p < e && out < limit) {
unsigned char c = *p++;
if (c < 0x80 && c != 0x5C) {
/* 7-bit bytes other than 0x5C map to the same codepoint */
*out++ = c;
} else if (c >= 0xA1 && c <= 0xDF) {
/* Single-byte range 0xA1-0xDF maps linearly onto U+FF61-U+FF9F */
*out++ = 0xFEC0 + c;
} else if (c > 0x80 && c <= 0xED && c != 0xA0) {
/* Lead byte of a double-byte character */
if (p == e) {
/* Input ends after the lead byte */
*out++ = MBFL_BAD_INPUT;
break;
}
unsigned char c2 = *p++;
if (c2 >= 0x40 && c2 <= 0xFC && c2 != 0x7F) {
unsigned int w = 0, s1 = 0, s2 = 0;
SJIS_DECODE(c, c2, s1, s2);
/* Linear index of the character in a grid with 94 cells per row */
unsigned int s = (s1 - 0x21)*94 + s2 - 0x21;
if (s <= 0x89) {
/* Hard-coded mappings which take priority over all the tables below */
if (s == 0x1C) {
w = 0x2014; /* EM DASH */
} else if (s == 0x1F) {
w = 0xFF3C; /* FULLWIDTH REVERSE SOLIDUS */
} else if (s == 0x20) {
w = 0x301C; /* WAVE DASH */
} else if (s == 0x21) {
w = 0x2016; /* DOUBLE VERTICAL LINE */
} else if (s == 0x3C) {
w = 0x2212; /* MINUS SIGN */
} else if (s == 0x50) {
w = 0xA2; /* CENT SIGN */
} else if (s == 0x51) {
w = 0xA3; /* POUND SIGN */
} else if (s == 0x89) {
w = 0xAC; /* NOT SIGN */
}
if (w) {
*out++ = w;
continue;
}
}
/* Contiguous ranges: each code_tbl row is {first index, last index, first codepoint} */
for (int i = 0; i < 7; i++) {
if (s >= code_tbl[i][0] && s <= code_tbl[i][1]) {
*out++ = s - code_tbl[i][0] + code_tbl[i][2];
goto next_iteration;
}
}
/* Characters which decode to a sequence of codepoints: column 0 of a
 * code_tbl_m row is the index, columns 1..n are the output sequence */
for (int i = 0; i < code_tbl_m_len; i++) {
if (s == code_tbl_m[i][0]) {
/* Sequence length depends on the leading codepoint: 3 for U+F860,
 * 4 for U+F861, otherwise 5 */
int n = 5;
if (code_tbl_m[i][1] == 0xF860) {
n = 3;
} else if (code_tbl_m[i][1] == 0xF861) {
n = 4;
}
if ((limit - out) < n) {
/* Not enough room for the whole sequence; un-read both bytes and stop */
p -= 2;
goto finished;
}
for (int j = 1; j <= n; j++) {
*out++ = code_tbl_m[i][j];
}
goto next_iteration;
}
}
/* Ranges resolved through per-range lookup arrays in code_map */
for (int i = 0; i < 8; i++) {
if (s >= code_ofst_tbl[i][0] && s <= code_ofst_tbl[i][1]) {
w = code_map[i][s - code_ofst_tbl[i][0]];
if (!w) {
*out++ = MBFL_BAD_INPUT;
goto next_iteration;
}
/* Room for two codepoints is required here even when only one
 * ends up being written below */
if ((limit - out) < 2) {
p -= 2;
goto finished;
}
*out++ = w;
/* Certain indices are followed by a second, tagging codepoint */
if (s >= 0x43E && s <= 0x441) {
*out++ = 0xF87A;
} else if (s == 0x3B1 || s == 0x3B7) {
*out++ = 0xF87F;
} else if (s == 0x4B8 || s == 0x4B9 || s == 0x4C4) {
*out++ = 0x20DD;
} else if (s == 0x1ED9 || s == 0x1EDA || s == 0x1EE8 || s == 0x1EF3 || (s >= 0x1EF5 && s <= 0x1EFB) || s == 0x1F05 || s == 0x1F06 || s == 0x1F18 || (s >= 0x1FF2 && s <= 0x20A5)) {
*out++ = 0xF87E;
}
goto next_iteration;
}
}
/* Nothing SJIS-mac specific matched; fall back to the JIS X 0208 table */
if (s < jisx0208_ucs_table_size) {
w = jisx0208_ucs_table[s];
}
if (!w)
w = MBFL_BAD_INPUT;
*out++ = w;
} else {
/* Invalid trail byte; both bytes have been consumed */
*out++ = MBFL_BAD_INPUT;
}
} else if (c == 0x5C) {
*out++ = 0xA5; /* YEN SIGN */
} else if (c == 0x80) {
*out++ = 0x5C; /* REVERSE SOLIDUS */
} else if (c == 0xA0) {
*out++ = 0xA0; /* NO-BREAK SPACE */
} else if (c == 0xFD) {
*out++ = 0xA9; /* COPYRIGHT SIGN */
} else if (c == 0xFE) {
*out++ = 0x2122; /* TRADE MARK SIGN */
} else if (c == 0xFF) {
/* Byte 0xFF decodes to two codepoints; make sure both fit */
if ((limit - out) < 2) {
p--;
break;
}
*out++ = 0x2026;
*out++ = 0xF87F;
} else {
/* Remaining byte values (0xEE-0xFC) are not accepted */
*out++ = MBFL_BAD_INPUT;
}
next_iteration: ;
}
finished:
/* Report back how much input is left and where it starts */
*in_len = e - p;
*in = p;
return out - buf;
}
/* Try to combine base codepoint `w` with the following tag codepoint `w2`.
 *
 * s_form_tbl is laid out as four consecutive groups, one per tag codepoint:
 *   entries  0..33  pair with U+F87E
 *   entries 34..36  pair with U+F87F
 *   entries 37..39  pair with U+20DD
 *   entries 40..43  pair with U+F87A
 *
 * If `w` appears in the group selected by `w2`, the corresponding entry of
 * s_form_sjis_tbl is stored in `*s` and true is returned. Otherwise `*s` is
 * left untouched and false is returned. */
static bool process_s_form(uint32_t w, uint32_t w2, unsigned int *s)
{
	int first, count;

	switch (w2) {
	case 0xF87E:
		first = 0;
		count = 34;
		break;
	case 0xF87F:
		first = 34;
		count = 3;
		break;
	case 0x20DD:
		first = 34 + 3;
		count = 3;
		break;
	case 0xF87A:
		first = 34 + 3 + 3;
		count = 4;
		break;
	default:
		/* Not one of the recognized tag codepoints */
		return false;
	}

	for (int k = first; k < first + count; k++) {
		if (s_form_tbl[k] == w) {
			*s = s_form_sjis_tbl[k];
			return true;
		}
	}

	return false;
}
/* For codepoints F860-F862, which are treated specially in MacJapanese.
 * Indexed by (codepoint - 0xF860). Each value is the total number of
 * codepoints in a sequence introduced by that codepoint, counting the
 * introducing codepoint itself; it is therefore also the last column of a
 * code_tbl_m row which has to match. The table is read-only. */
static const int transcoding_hint_cp_width[3] = { 3, 4, 5 };
/* Encode Unicode codepoints as SJIS-mac (MacJapanese) bytes.
 *
 * in, len: input codepoints and their count.
 * buf: output conversion buffer; buf->state carries unfinished work from one
 *      call to the next.
 * end: true if no further input will follow this call.
 *
 * Some output characters are selected by a sequence of two or more input
 * codepoints. If the input runs out in the middle of such a sequence and
 * `end` is false, the progress made so far is saved in buf->state and picked
 * up again at the start of the next call. buf->state is written in two forms:
 *   - a bare codepoint `w`, which is to be processed again from scratch
 *   - (i << 24) | (index << 16) | (w & 0xFFFF), for a partly matched
 *     code_tbl_m row `i` whose next column to compare is `index`
 *
 * NOTE(review): the two forms are told apart by testing the top 8 bits only.
 * A partial match on row i == 0 would leave those bits zero and be resumed
 * as a bare codepoint -- confirm whether row 0 can reach that path.
 *
 * MB_CONVERT_BUF_ENSURE and MB_CONVERT_ERROR are macros defined elsewhere;
 * presumably the former guarantees room for the given number of output bytes
 * and the latter reports an unconvertible codepoint -- TODO confirm. */
static void mb_wchar_to_sjismac(uint32_t *in, size_t len, mb_convert_buf *buf, bool end)
{
unsigned char *out, *limit;
MB_CONVERT_BUF_LOAD(buf, out, limit);
MB_CONVERT_BUF_ENSURE(buf, out, limit, len);
uint32_t w;
if (buf->state) {
/* Work left over from the previous call; low 16 bits hold the codepoint */
w = buf->state & 0xFFFF;
if (buf->state & 0xFF000000L) {
/* Jumps into the middle of the matching loop further down; `s`, `i`,
 * `index` and `w2` are all assigned there before they are read */
goto resume_transcoding_hint;
} else {
buf->state = 0;
goto process_codepoint;
}
}
while (len--) {
w = *in++;
process_codepoint: ;
unsigned int s = 0;
/* First try the four general Unicode -> JIS tables, with a few
 * SJIS-mac specific overrides */
if (w >= ucs_a1_jis_table_min && w < ucs_a1_jis_table_max) {
if (w == 0x5C) {
s = 0x80;
} else if (w == 0xA9) {
s = 0xFD;
} else {
s = ucs_a1_jis_table[w - ucs_a1_jis_table_min];
}
} else if (w >= ucs_a2_jis_table_min && w < ucs_a2_jis_table_max) {
if (w == 0x2122) {
s = 0xFE;
} else if (w == 0x2014) {
s = 0x213D;
} else if (w == 0x2116) {
s = 0x2C1D;
} else {
s = ucs_a2_jis_table[w - ucs_a2_jis_table_min];
}
} else if (w >= ucs_i_jis_table_min && w < ucs_i_jis_table_max) {
s = ucs_i_jis_table[w - ucs_i_jis_table_min];
} else if (w >= ucs_r_jis_table_min && w < ucs_r_jis_table_max) {
s = ucs_r_jis_table[w - ucs_r_jis_table_min];
}
if (w >= 0x2000) {
/* Codepoints listed in s_form_tbl may combine with the codepoint
 * which follows them (see process_s_form) */
for (int i = 0; i < s_form_tbl_len; i++) {
if (w == s_form_tbl[i]) {
if (!len) {
/* No following codepoint is available yet */
if (end) {
/* Nothing more will come; emit the stand-alone fallback, if any */
s = s_form_sjis_fallback_tbl[i];
if (s) {
MB_CONVERT_BUF_ENSURE(buf, out, limit, 2);
out = mb_convert_buf_add2(out, (s >> 8) & 0xFF, s & 0xFF);
} else {
MB_CONVERT_ERROR(buf, out, limit, w, mb_wchar_to_sjismac);
}
} else {
/* Remember `w` and reprocess it at the start of the next call */
buf->state = w;
}
MB_CONVERT_BUF_STORE(buf, out, limit);
return;
}
uint32_t w2 = *in++;
len--;
if (!process_s_form(w, w2, &s)) {
/* `w2` does not combine with `w`; put it back and use the fallback for `w` */
in--; len++;
/* NOTE(review): this inner `i` shadows the outer one and repeats the
 * same first-match search, so it appears to land on the same index */
for (int i = 0; i < s_form_tbl_len; i++) {
if (w == s_form_tbl[i]) {
s = s_form_sjis_fallback_tbl[i];
break;
}
}
}
/* The value is written as-is, without passing through SJIS_ENCODE.
 * NOTE(review): unlike the end-of-input branch above, a fallback of zero
 * is not checked for here, and the single-byte write has no preceding
 * MB_CONVERT_BUF_ENSURE of its own -- confirm both are intended */
if (s <= 0xFF) {
out = mb_convert_buf_add(out, s);
} else {
MB_CONVERT_BUF_ENSURE(buf, out, limit, len + 2);
out = mb_convert_buf_add2(out, (s >> 8) & 0xFF, s & 0xFF);
}
goto next_iteration;
}
}
if (w == 0xF860 || w == 0xF861 || w == 0xF862) {
/* Apple 'transcoding hint' codepoints (from private use area) */
if (!len) {
if (end) {
MB_CONVERT_ERROR(buf, out, limit, w, mb_wchar_to_sjismac);
} else {
/* Remember the hint and reprocess it at the start of the next call */
buf->state = w;
}
MB_CONVERT_BUF_STORE(buf, out, limit);
return;
}
uint32_t w2 = *in++;
len--;
/* Look for a code_tbl_m row whose columns 1 and 2 are `w` and `w2` */
for (int i = 0; i < code_tbl_m_len; i++) {
if (w == code_tbl_m[i][1] && w2 == code_tbl_m[i][2]) {
/* This might be a valid transcoding hint sequence */
int index = 3;
resume_transcoding_hint:
/* When arriving here from the top of the function, buf->state is
 * non-zero and supplies the row and column at which to continue */
if (buf->state) {
i = buf->state >> 24;
index = (buf->state >> 16) & 0xFF;
buf->state = 0;
}
/* Last column of the row which has to match for this hint */
int expected = transcoding_hint_cp_width[w - 0xF860];
while (index <= expected) {
if (!len) {
if (end) {
/* Sequence is cut short for good; report everything consumed so far */
for (int j = 1; j < index; j++) {
MB_CONVERT_ERROR(buf, out, limit, code_tbl_m[i][j], mb_wchar_to_sjismac);
}
} else {
/* Save row, next column and hint codepoint for the next call */
buf->state = (i << 24) | (index << 16) | (w & 0xFFFF);
}
MB_CONVERT_BUF_STORE(buf, out, limit);
return;
}
w2 = *in++;
len--;
if (w2 != code_tbl_m[i][index]) {
/* Didn't match */
/* Both the already-matched codepoints and the mismatching one are
 * reported as errors; the mismatching one is not reprocessed */
for (int j = 1; j < index; j++) {
MB_CONVERT_ERROR(buf, out, limit, code_tbl_m[i][j], mb_wchar_to_sjismac);
}
MB_CONVERT_ERROR(buf, out, limit, w2, mb_wchar_to_sjismac);
MB_CONVERT_BUF_ENSURE(buf, out, limit, len);
goto next_iteration;
}
index++;
}
/* Successful match, emit SJIS-mac bytes */
s = code_tbl_m[i][0];
unsigned int c1 = (s / 94) + 0x21, c2 = (s % 94) + 0x21, s1, s2;
SJIS_ENCODE(c1, c2, s1, s2);
MB_CONVERT_BUF_ENSURE(buf, out, limit, len + 2);
out = mb_convert_buf_add2(out, s1, s2);
goto next_iteration;
}
}
/* No valid transcoding hint sequence found */
/* Put `w2` back so that it is processed as an ordinary codepoint */
in--; len++;
MB_CONVERT_ERROR(buf, out, limit, w, mb_wchar_to_sjismac);
MB_CONVERT_BUF_ENSURE(buf, out, limit, len);
continue;
}
}
if (!s) {
/* The general tables had no mapping; try the SJIS-mac specific ones */
if (w == 0xA0) {
s = 0xA0;
} else if (w == 0xA5) { /* YEN SIGN */
/* Unicode has codepoint 0xFFE5 for a fullwidth Yen sign;
 * convert codepoint 0xA5 to halfwidth Yen sign */
s = 0x5C; /* HALFWIDTH YEN SIGN */
} else if (w == 0xFF3C) { /* FULLWIDTH REVERSE SOLIDUS */
s = 0x2140;
} else {
/* Each of the three lookups below yields a linear index (94 cells per
 * row), which is then split into two bytes, each offset by 0x21 */
for (int i = 0; i < wchar2sjis_mac_r_tbl_len; i++) {
if (w >= wchar2sjis_mac_r_tbl[i][0] && w <= wchar2sjis_mac_r_tbl[i][1]) {
s = w - wchar2sjis_mac_r_tbl[i][0] + wchar2sjis_mac_r_tbl[i][2];
s = (((s / 94) + 0x21) << 8) | ((s % 94) + 0x21);
goto found_kuten_code;
}
}
for (int i = 0; i < wchar2sjis_mac_r_map_len; i++) {
if (w >= wchar2sjis_mac_r_map[i][0] && w <= wchar2sjis_mac_r_map[i][1]) {
s = wchar2sjis_mac_code_map[i][w - wchar2sjis_mac_r_map[i][0]];
if (s) {
s = (((s / 94) + 0x21) << 8) | ((s % 94) + 0x21);
goto found_kuten_code;
}
}
}
for (int i = 0; i < wchar2sjis_mac_wchar_tbl_len; i++) {
if (w == wchar2sjis_mac_wchar_tbl[i][0]) {
s = wchar2sjis_mac_wchar_tbl[i][1];
s = (((s / 94) + 0x21) << 8) | ((s % 94) + 0x21);
goto found_kuten_code;
}
}
}
}
found_kuten_code:
/* A zero result is only acceptable for codepoint zero itself; values of
 * 0x8080 and above are rejected as well */
if ((!s && w) || s >= 0x8080) {
MB_CONVERT_ERROR(buf, out, limit, w, mb_wchar_to_sjismac);
MB_CONVERT_BUF_ENSURE(buf, out, limit, len);
} else if (s <= 0xFF) {
/* Single-byte character */
out = mb_convert_buf_add(out, s);
} else {
/* Double-byte character: split into two bytes and convert with SJIS_ENCODE */
unsigned int c1 = (s >> 8) & 0xFF, c2 = s & 0xFF, s1, s2;
SJIS_ENCODE(c1, c2, s1, s2);
MB_CONVERT_BUF_ENSURE(buf, out, limit, len + 2);
out = mb_convert_buf_add2(out, s1, s2);
}
next_iteration: ;
}
MB_CONVERT_BUF_STORE(buf, out, limit);
}

View File

@@ -87,6 +87,9 @@ findInvalidChars($fromUnicode, $invalidChars, $unused, array_fill_keys(range(0,
// Run every entry of $invalidChars through the UTF-16BE -> SJIS-mac
// conversion; '%' is passed as the last argument (presumably the marker
// expected in place of each unconvertible character -- see the helper)
convertAllInvalidChars($invalidChars, $fromUnicode, 'UTF-16BE', 'SJIS-mac', '%');
echo "Unicode -> SJIS-mac conversion works on all invalid characters\n";
// Regression test: U+2026 followed by U+6BAA must convert to "\x81\x63\x9F\x6F"
convertValidString("\x20\x26\x6B\xAA", "\x81\x63\x9F\x6F", "UTF-16BE", "SJIS-mac");
// Test special combining characters for MacJapanese when *not* appearing in
// an expected combination: U+2010 followed by U+F87A should give "\x81\x5D"
// for the first codepoint and '%' for the second
convertInvalidString("\x20\x10\xF8\x7A", "\x81\x5D%", "UTF-16BE", "SJIS-mac");