diff --git a/ext/mbstring/libmbfl/filters/mbfilter_cp5022x.c b/ext/mbstring/libmbfl/filters/mbfilter_cp5022x.c index ff5cfb43086..f9e64c32589 100644 --- a/ext/mbstring/libmbfl/filters/mbfilter_cp5022x.c +++ b/ext/mbstring/libmbfl/filters/mbfilter_cp5022x.c @@ -847,11 +847,27 @@ static void mb_wchar_to_cp50220(uint32_t *in, size_t len, mb_convert_buf *buf, b MB_CONVERT_BUF_ENSURE(buf, out, limit, len); bool consumed = false; + uint32_t w; + + if (buf->state & 0xFFFF00) { + /* Reprocess cached codepoint */ + w = buf->state >> 8; + buf->state &= 0xFF; + goto reprocess_codepoint; + } while (len--) { - uint32_t w = *in++; + w = *in++; +reprocess_codepoint: - w = mbfl_convert_kana(w, len ? *in : 0, &consumed, NULL, MBFL_HAN2ZEN_KATAKANA | MBFL_HAN2ZEN_GLUE); + if (w >= 0xFF61 && w <= 0xFF9F && !len && !end) { + /* This codepoint may need to combine with the next one, + * but the 'next one' will come in a separate buffer */ + buf->state |= w << 8; + break; + } else { + w = mbfl_convert_kana(w, len ? *in : 0, &consumed, NULL, MBFL_HAN2ZEN_KATAKANA | MBFL_HAN2ZEN_GLUE); + } if (consumed) { /* Two successive codepoints were converted into one */ diff --git a/ext/mbstring/tests/cp5022x_encoding.phpt b/ext/mbstring/tests/cp5022x_encoding.phpt index 04a9c015536..f8e5831ebe2 100644 --- a/ext/mbstring/tests/cp5022x_encoding.phpt +++ b/ext/mbstring/tests/cp5022x_encoding.phpt @@ -382,6 +382,14 @@ $converted = mb_convert_encoding("\xff\xff\x00&", 'CP50220', 'UTF-16BE'); if ($converted !== '?&') die("Bad handling of erroneous codepoint followed by good one (got " . bin2hex($converted) . ")"); +// In CP50220, two codepoints can be collapsed into a single kuten code in some cases +// This should work even on a boundary between separately processed buffers +$shouldCollapse = "\xFF\x76\xFF\x9E"; +$expected = "\x1B\$B%,\x1B(B"; +for ($i = 0; $i < 256; $i++) { + convertValidString(str_repeat("\x00a", $i) . $shouldCollapse, str_repeat('a', $i) . $expected, 'UTF-16BE', 'CP50220', false); +} + ?> --EXPECT-- ASCII support OK