diff --git a/ext/mbstring/libmbfl/filters/mbfilter_sjis_mac.c b/ext/mbstring/libmbfl/filters/mbfilter_sjis_mac.c
index 97fbb59e505..5163db67b3c 100644
--- a/ext/mbstring/libmbfl/filters/mbfilter_sjis_mac.c
+++ b/ext/mbstring/libmbfl/filters/mbfilter_sjis_mac.c
@@ -39,6 +39,8 @@ extern const unsigned char mblen_table_sjis[];
 
 static int mbfl_filt_conv_wchar_sjis_mac_flush(mbfl_convert_filter *filter);
 static int mbfl_filt_conv_sjis_mac_wchar_flush(mbfl_convert_filter *filter);
+static size_t mb_sjismac_to_wchar(unsigned char **in, size_t *in_len, uint32_t *buf, size_t bufsize, unsigned int *state);
+static void mb_wchar_to_sjismac(uint32_t *in, size_t len, mb_convert_buf *buf, bool end);
 
 static const char *mbfl_encoding_sjis_mac_aliases[] = {"MacJapanese", "x-Mac-Japanese", NULL};
 
@@ -51,8 +53,8 @@ const mbfl_encoding mbfl_encoding_sjis_mac = {
 	MBFL_ENCTYPE_GL_UNSAFE,
 	&vtbl_sjis_mac_wchar,
 	&vtbl_wchar_sjis_mac,
-	NULL,
-	NULL
+	mb_sjismac_to_wchar,
+	mb_wchar_to_sjismac
 };
 
 const struct mbfl_convert_vtbl vtbl_sjis_mac_wchar = {
@@ -662,3 +664,434 @@ mbfl_filt_conv_wchar_sjis_mac_flush(mbfl_convert_filter *filter)
 
 	return 0;
 }
+
+/* Decode SJIS-mac (MacJapanese) bytes into Unicode codepoints.
+ *
+ * Reads from *in (*in_len bytes) and writes at most bufsize codepoints to buf.
+ * On return, *in and *in_len have been advanced past the consumed bytes; the
+ * return value is the number of codepoints written. 'state' is not used.
+ *
+ * Some byte sequences decode to several codepoints. If those do not all fit in
+ * the remaining output space, the sequence is left unconsumed so that it can be
+ * decoded by a later call. */
+static size_t mb_sjismac_to_wchar(unsigned char **in, size_t *in_len, uint32_t *buf, size_t bufsize, unsigned int *state)
+{
+	unsigned char *p = *in, *e = p + *in_len;
+	uint32_t *out = buf, *limit = buf + bufsize;
+
+	while (p < e && out < limit) {
+		unsigned char c = *p++;
+
+		if (c < 0x80 && c != 0x5C) {
+			*out++ = c;
+		} else if (c >= 0xA1 && c <= 0xDF) {
+			*out++ = 0xFEC0 + c;
+		} else if (c > 0x80 && c <= 0xED && c != 0xA0) {
+			if (p == e) {
+				/* Input ends in the middle of a double-byte character */
+				*out++ = MBFL_BAD_INPUT;
+				break;
+			}
+			unsigned char c2 = *p++;
+
+			if (c2 >= 0x40 && c2 <= 0xFC && c2 != 0x7F) {
+				unsigned int w = 0, s1 = 0, s2 = 0;
+				SJIS_DECODE(c, c2, s1, s2);
+				unsigned int s = (s1 - 0x21)*94 + s2 - 0x21;
+
+				if (s <= 0x89) {
+					if (s == 0x1C) {
+						w = 0x2014; /* EM DASH */
+					} else if (s == 0x1F) {
+						w = 0xFF3C; /* FULLWIDTH REVERSE SOLIDUS */
+					} else if (s == 0x20) {
+						w = 0x301C; /* WAVE DASH */
+					} else if (s == 0x21) {
+						w = 0x2016; /* DOUBLE VERTICAL LINE */
+					} else if (s == 0x3C) {
+						w = 0x2212; /* MINUS SIGN */
+					} else if (s == 0x50) {
+						w = 0xA2; /* CENT SIGN */
+					} else if (s == 0x51) {
+						w = 0xA3; /* POUND SIGN */
+					} else if (s == 0x89) {
+						w = 0xAC; /* NOT SIGN */
+					}
+					if (w) {
+						*out++ = w;
+						continue;
+					}
+				}
+
+				for (int i = 0; i < 7; i++) {
+					if (s >= code_tbl[i][0] && s <= code_tbl[i][1]) {
+						*out++ = s - code_tbl[i][0] + code_tbl[i][2];
+						goto next_iteration;
+					}
+				}
+
+				for (int i = 0; i < code_tbl_m_len; i++) {
+					if (s == code_tbl_m[i][0]) {
+						/* Number of codepoints to emit depends on the leading codepoint */
+						int n = 5;
+						if (code_tbl_m[i][1] == 0xF860) {
+							n = 3;
+						} else if (code_tbl_m[i][1] == 0xF861) {
+							n = 4;
+						}
+						if ((limit - out) < n) {
+							/* Not enough room; un-read both bytes and stop here */
+							p -= 2;
+							goto finished;
+						}
+						for (int j = 1; j <= n; j++) {
+							*out++ = code_tbl_m[i][j];
+						}
+						goto next_iteration;
+					}
+				}
+
+				for (int i = 0; i < 8; i++) {
+					if (s >= code_ofst_tbl[i][0] && s <= code_ofst_tbl[i][1]) {
+						w = code_map[i][s - code_ofst_tbl[i][0]];
+						if (!w) {
+							*out++ = MBFL_BAD_INPUT;
+							goto next_iteration;
+						}
+						if ((limit - out) < 2) {
+							/* A second codepoint may follow 'w'; make sure both will fit */
+							p -= 2;
+							goto finished;
+						}
+						*out++ = w;
+						if (s >= 0x43E && s <= 0x441) {
+							*out++ = 0xF87A;
+						} else if (s == 0x3B1 || s == 0x3B7) {
+							*out++ = 0xF87F;
+						} else if (s == 0x4B8 || s == 0x4B9 || s == 0x4C4) {
+							*out++ = 0x20DD;
+						} else if (s == 0x1ED9 || s == 0x1EDA || s == 0x1EE8 || s == 0x1EF3 || (s >= 0x1EF5 && s <= 0x1EFB) || s == 0x1F05 || s == 0x1F06 || s == 0x1F18 || (s >= 0x1FF2 && s <= 0x20A5)) {
+							*out++ = 0xF87E;
+						}
+						goto next_iteration;
+					}
+				}
+
+				if (s < jisx0208_ucs_table_size) {
+					w = jisx0208_ucs_table[s];
+				}
+
+				if (!w) {
+					w = MBFL_BAD_INPUT;
+				}
+				*out++ = w;
+			} else {
+				*out++ = MBFL_BAD_INPUT;
+			}
+		} else if (c == 0x5C) {
+			*out++ = 0xA5;
+		} else if (c == 0x80) {
+			*out++ = 0x5C;
+		} else if (c == 0xA0) {
+			*out++ = 0xA0;
+		} else if (c == 0xFD) {
+			*out++ = 0xA9;
+		} else if (c == 0xFE) {
+			*out++ = 0x2122;
+		} else if (c == 0xFF) {
+			if ((limit - out) < 2) {
+				/* Two codepoints are needed; un-read the byte and stop here */
+				p--;
+				break;
+			}
+			*out++ = 0x2026;
+			*out++ = 0xF87F;
+		} else {
+			*out++ = MBFL_BAD_INPUT;
+		}
+next_iteration: ;
+	}
+
+finished:
+	*in_len = e - p;
+	*in = p;
+	return out - buf;
+}
+
+/* Find the SJIS-mac code for codepoint 'w' when it is followed by 'w2'.
+ *
+ * s_form_tbl and s_form_sjis_tbl are parallel tables, divided into one section
+ * for each codepoint which may follow: entries 0-33 are for U+F87E, 34-36 for
+ * U+F87F, 37-39 for U+20DD and 40-43 for U+F87A.
+ *
+ * Returns true and stores the code in *s if found; otherwise returns false and
+ * leaves *s unchanged. */
+static bool process_s_form(uint32_t w, uint32_t w2, unsigned int *s)
+{
+	int first, count;
+
+	if (w2 == 0xF87A) {
+		first = 34 + 3 + 3;
+		count = 4;
+	} else if (w2 == 0x20DD) {
+		first = 34 + 3;
+		count = 3;
+	} else if (w2 == 0xF87F) {
+		first = 34;
+		count = 3;
+	} else if (w2 == 0xF87E) {
+		first = 0;
+		count = 34;
+	} else {
+		return false;
+	}
+
+	for (int i = first; i < first + count; i++) {
+		if (w == s_form_tbl[i]) {
+			*s = s_form_sjis_tbl[i];
+			return true;
+		}
+	}
+
+	return false;
+}
+
+/* For codepoints F860-F862, which are treated specially in MacJapanese:
+ * index of the last code_tbl_m column belonging to a sequence which starts
+ * with that codepoint */
+static const int transcoding_hint_cp_width[3] = { 3, 4, 5 };
+
+/* Encode Unicode codepoints as SJIS-mac (MacJapanese).
+ *
+ * Converts 'len' codepoints from 'in' and appends the output to 'buf'. 'end' is
+ * true when no more input will follow.
+ *
+ * Some SJIS-mac characters are represented by a sequence of codepoints. If 'in'
+ * runs out in the middle of such a sequence and 'end' is false, the position is
+ * saved in buf->state so that a later call can pick up where this one stopped:
+ *   bits 0-15:  the codepoint which started the sequence
+ *   bits 16-23: next code_tbl_m column to compare (transcoding hints only)
+ *   bits 24-31: code_tbl_m row being matched, plus one (transcoding hints only)
+ * The row is stored plus one so that row zero can be told apart from the case
+ * where just one codepoint has been seen so far. */
+static void mb_wchar_to_sjismac(uint32_t *in, size_t len, mb_convert_buf *buf, bool end)
+{
+	unsigned char *out, *limit;
+	MB_CONVERT_BUF_LOAD(buf, out, limit);
+	MB_CONVERT_BUF_ENSURE(buf, out, limit, len);
+
+	uint32_t w;
+
+	if (buf->state) {
+		w = buf->state & 0xFFFF;
+		if (buf->state & 0xFF000000L) {
+			goto resume_transcoding_hint;
+		} else {
+			buf->state = 0;
+			goto process_codepoint;
+		}
+	}
+
+	while (len--) {
+		w = *in++;
+process_codepoint: ;
+		unsigned int s = 0;
+
+		if (w >= ucs_a1_jis_table_min && w < ucs_a1_jis_table_max) {
+			if (w == 0x5C) {
+				s = 0x80;
+			} else if (w == 0xA9) {
+				s = 0xFD;
+			} else {
+				s = ucs_a1_jis_table[w - ucs_a1_jis_table_min];
+			}
+		} else if (w >= ucs_a2_jis_table_min && w < ucs_a2_jis_table_max) {
+			if (w == 0x2122) {
+				s = 0xFE;
+			} else if (w == 0x2014) {
+				s = 0x213D;
+			} else if (w == 0x2116) {
+				s = 0x2C1D;
+			} else {
+				s = ucs_a2_jis_table[w - ucs_a2_jis_table_min];
+			}
+		} else if (w >= ucs_i_jis_table_min && w < ucs_i_jis_table_max) {
+			s = ucs_i_jis_table[w - ucs_i_jis_table_min];
+		} else if (w >= ucs_r_jis_table_min && w < ucs_r_jis_table_max) {
+			s = ucs_r_jis_table[w - ucs_r_jis_table_min];
+		}
+
+		if (w >= 0x2000) {
+			for (int i = 0; i < s_form_tbl_len; i++) {
+				if (w == s_form_tbl[i]) {
+					if (!len) {
+						if (end) {
+							s = s_form_sjis_fallback_tbl[i];
+							if (s) {
+								MB_CONVERT_BUF_ENSURE(buf, out, limit, 2);
+								out = mb_convert_buf_add2(out, (s >> 8) & 0xFF, s & 0xFF);
+							} else {
+								MB_CONVERT_ERROR(buf, out, limit, w, mb_wchar_to_sjismac);
+							}
+						} else {
+							buf->state = w;
+						}
+						MB_CONVERT_BUF_STORE(buf, out, limit);
+						return;
+					}
+					uint32_t w2 = *in++;
+					len--;
+
+					if (!process_s_form(w, w2, &s)) {
+						/* 'w2' does not combine with 'w'; put it back and encode 'w' alone */
+						in--; len++;
+						s = s_form_sjis_fallback_tbl[i];
+					}
+
+					if (!s) {
+						/* No fallback code; treat it just as is done at the end of the input */
+						MB_CONVERT_ERROR(buf, out, limit, w, mb_wchar_to_sjismac);
+						MB_CONVERT_BUF_ENSURE(buf, out, limit, len);
+					} else if (s <= 0xFF) {
+						out = mb_convert_buf_add(out, s);
+					} else {
+						MB_CONVERT_BUF_ENSURE(buf, out, limit, len + 2);
+						out = mb_convert_buf_add2(out, (s >> 8) & 0xFF, s & 0xFF);
+					}
+
+					goto next_iteration;
+				}
+			}
+
+			if (w == 0xF860 || w == 0xF861 || w == 0xF862) {
+				/* Apple 'transcoding hint' codepoints (from private use area) */
+				if (!len) {
+					if (end) {
+						MB_CONVERT_ERROR(buf, out, limit, w, mb_wchar_to_sjismac);
+					} else {
+						buf->state = w;
+					}
+					MB_CONVERT_BUF_STORE(buf, out, limit);
+					return;
+				}
+
+				uint32_t w2 = *in++;
+				len--;
+
+				for (int i = 0; i < code_tbl_m_len; i++) {
+					if (w == code_tbl_m[i][1] && w2 == code_tbl_m[i][2]) {
+						/* This might be a valid transcoding hint sequence */
+						int index = 3;
+
+resume_transcoding_hint:
+						if (buf->state) {
+							i = (int)(buf->state >> 24) - 1;
+							index = (buf->state >> 16) & 0xFF;
+							buf->state = 0;
+						}
+
+						int expected = transcoding_hint_cp_width[w - 0xF860];
+
+						while (index <= expected) {
+							if (!len) {
+								if (end) {
+									for (int j = 1; j < index; j++) {
+										MB_CONVERT_ERROR(buf, out, limit, code_tbl_m[i][j], mb_wchar_to_sjismac);
+									}
+								} else {
+									buf->state = ((uint32_t)(i + 1) << 24) | ((uint32_t)index << 16) | (w & 0xFFFF);
+								}
+								MB_CONVERT_BUF_STORE(buf, out, limit);
+								return;
+							}
+
+							w2 = *in++;
+							len--;
+
+							if (w2 != code_tbl_m[i][index]) {
+								/* Didn't match */
+								for (int j = 1; j < index; j++) {
+									MB_CONVERT_ERROR(buf, out, limit, code_tbl_m[i][j], mb_wchar_to_sjismac);
+								}
+								MB_CONVERT_ERROR(buf, out, limit, w2, mb_wchar_to_sjismac);
+								MB_CONVERT_BUF_ENSURE(buf, out, limit, len);
+								goto next_iteration;
+							}
+
+							index++;
+						}
+
+						/* Successful match, emit SJIS-mac bytes */
+						s = code_tbl_m[i][0];
+						unsigned int c1 = (s / 94) + 0x21, c2 = (s % 94) + 0x21, s1, s2;
+						SJIS_ENCODE(c1, c2, s1, s2);
+						MB_CONVERT_BUF_ENSURE(buf, out, limit, len + 2);
+						out = mb_convert_buf_add2(out, s1, s2);
+						goto next_iteration;
+					}
+				}
+
+				/* No valid transcoding hint sequence found */
+				in--; len++;
+				MB_CONVERT_ERROR(buf, out, limit, w, mb_wchar_to_sjismac);
+				MB_CONVERT_BUF_ENSURE(buf, out, limit, len);
+				continue;
+			}
+		}
+
+		if (!s) {
+			if (w == 0xA0) {
+				s = 0xA0;
+			} else if (w == 0xA5) { /* YEN SIGN */
+				/* Unicode has codepoint 0xFFE5 for a fullwidth Yen sign;
+				 * convert codepoint 0xA5 to halfwidth Yen sign */
+				s = 0x5C; /* HALFWIDTH YEN SIGN */
+			} else if (w == 0xFF3C) { /* FULLWIDTH REVERSE SOLIDUS */
+				s = 0x2140;
+			} else {
+				for (int i = 0; i < wchar2sjis_mac_r_tbl_len; i++) {
+					if (w >= wchar2sjis_mac_r_tbl[i][0] && w <= wchar2sjis_mac_r_tbl[i][1]) {
+						s = w - wchar2sjis_mac_r_tbl[i][0] + wchar2sjis_mac_r_tbl[i][2];
+						s = (((s / 94) + 0x21) << 8) | ((s % 94) + 0x21);
+						goto found_kuten_code;
+					}
+				}
+
+				for (int i = 0; i < wchar2sjis_mac_r_map_len; i++) {
+					if (w >= wchar2sjis_mac_r_map[i][0] && w <= wchar2sjis_mac_r_map[i][1]) {
+						s = wchar2sjis_mac_code_map[i][w - wchar2sjis_mac_r_map[i][0]];
+						if (s) {
+							s = (((s / 94) + 0x21) << 8) | ((s % 94) + 0x21);
+							goto found_kuten_code;
+						}
+					}
+				}
+
+				for (int i = 0; i < wchar2sjis_mac_wchar_tbl_len; i++) {
+					if (w == wchar2sjis_mac_wchar_tbl[i][0]) {
+						s = wchar2sjis_mac_wchar_tbl[i][1];
+						s = (((s / 94) + 0x21) << 8) | ((s % 94) + 0x21);
+						goto found_kuten_code;
+					}
+				}
+			}
+		}
+
+found_kuten_code:
+		if ((!s && w) || s >= 0x8080) {
+			MB_CONVERT_ERROR(buf, out, limit, w, mb_wchar_to_sjismac);
+			MB_CONVERT_BUF_ENSURE(buf, out, limit, len);
+		} else if (s <= 0xFF) {
+			out = mb_convert_buf_add(out, s);
+		} else {
+			unsigned int c1 = (s >> 8) & 0xFF, c2 = s & 0xFF, s1, s2;
+			SJIS_ENCODE(c1, c2, s1, s2);
+			MB_CONVERT_BUF_ENSURE(buf, out, limit, len + 2);
+			out = mb_convert_buf_add2(out, s1, s2);
+		}
+
+next_iteration: ;
+	}
+
+	MB_CONVERT_BUF_STORE(buf, out, limit);
+}
diff --git a/ext/mbstring/tests/sjismac_encoding.phpt b/ext/mbstring/tests/sjismac_encoding.phpt
index 5d1c2de869e..8013d82e51b 100644
--- a/ext/mbstring/tests/sjismac_encoding.phpt
+++ b/ext/mbstring/tests/sjismac_encoding.phpt
@@ -87,6 +87,9 @@ findInvalidChars($fromUnicode, $invalidChars, $unused, array_fill_keys(range(0,
 convertAllInvalidChars($invalidChars, $fromUnicode, 'UTF-16BE', 'SJIS-mac', '%');
 echo "Unicode -> SJIS-mac conversion works on all invalid characters\n";
 
+// Regression test
+convertValidString("\x20\x26\x6B\xAA", "\x81\x63\x9F\x6F", "UTF-16BE", "SJIS-mac");
+
 // Test special combining characters for MacJapanese when *not* appearing in
 // an expected combination
 convertInvalidString("\x20\x10\xF8\x7A", "\x81\x5D%", "UTF-16BE", "SJIS-mac");