diff --git a/ext/mbstring/libmbfl/filters/mbfilter_utf32.c b/ext/mbstring/libmbfl/filters/mbfilter_utf32.c index e56a728ddc8..c356a824a65 100644 --- a/ext/mbstring/libmbfl/filters/mbfilter_utf32.c +++ b/ext/mbstring/libmbfl/filters/mbfilter_utf32.c @@ -131,7 +131,7 @@ const struct mbfl_convert_vtbl vtbl_wchar_utf32le = { static int emit_char_if_valid(int n, mbfl_convert_filter *filter) { - if (n < MBFL_WCSPLANE_UTF32MAX && (n < 0xD800 || n > 0xDFFF)) { + if (n >= 0 && n < MBFL_WCSPLANE_UTF32MAX && (n < 0xD800 || n > 0xDFFF)) { CK((*filter->output_function)(n, filter->data)); } else { n = (n & MBFL_WCSGROUP_MASK) | MBFL_WCSGROUP_THROUGH; diff --git a/ext/mbstring/libmbfl/filters/mbfilter_utf8.c b/ext/mbstring/libmbfl/filters/mbfilter_utf8.c index 9da9f05315f..b8910ff559a 100644 --- a/ext/mbstring/libmbfl/filters/mbfilter_utf8.c +++ b/ext/mbstring/libmbfl/filters/mbfilter_utf8.c @@ -86,19 +86,11 @@ const struct mbfl_convert_vtbl vtbl_wchar_utf8 = { int mbfl_filt_put_invalid_char(int c, mbfl_convert_filter *filter) { - int w; - w = c & MBFL_WCSGROUP_MASK; - w |= MBFL_WCSGROUP_THROUGH; - filter->status = 0; - filter->cache = 0; - CK((*filter->output_function)(w, filter->data)); + filter->status = filter->cache = 0; + CK((*filter->output_function)((c & MBFL_WCSGROUP_MASK) | MBFL_WCSGROUP_THROUGH, filter->data)); return 0; } - -/* - * UTF-8 => wchar - */ int mbfl_filt_conv_utf8_wchar(int c, mbfl_convert_filter *filter) { int s, c1; @@ -131,7 +123,8 @@ retry: CK((*filter->output_function)(s, filter->data)); } else { CK(mbfl_filt_put_invalid_char(filter->cache, filter)); - goto retry; + if (c < 0x80 || (c >= 0xc2 && c <= 0xf4)) + goto retry; } break; case 0x20: /* 3byte code 2nd char: 0:0xa0-0xbf,D:0x80-9F,1-C,E-F:0x80-0x9f */ @@ -146,7 +139,8 @@ retry: filter->status++; } else { CK(mbfl_filt_put_invalid_char(filter->cache, filter)); - goto retry; + if (c < 0x80 || (c >= 0xc2 && c <= 0xf4)) + goto retry; } break; case 0x30: /* 4byte code 2nd char: 0:0x90-0xbf,1-3:0x80-0xbf,4:0x80-0x8f */ @@ -161,7 +155,8 @@ retry: filter->status++; } else { CK(mbfl_filt_put_invalid_char(filter->cache, filter)); - goto retry; + if (c < 0x80 || (c >= 0xc2 && c <= 0xf4)) + goto retry; } break; case 0x31: /* 4byte code 3rd char: 0x80-0xbf */ @@ -170,7 +165,8 @@ retry: filter->status++; } else { CK(mbfl_filt_put_invalid_char(filter->cache, filter)); - goto retry; + if (c < 0x80 || (c >= 0xc2 && c <= 0xf4)) + goto retry; } break; default: @@ -183,27 +179,21 @@ retry: int mbfl_filt_conv_utf8_wchar_flush(mbfl_convert_filter *filter) { - int status, cache; + int status = filter->status, cache = filter->cache; - status = filter->status; - cache = filter->cache; + filter->status = filter->cache = 0; - filter->status = 0; - filter->cache = 0; - - if (status != 0) { + if (status) { CK(mbfl_filt_put_invalid_char(cache, filter)); } - if (filter->flush_function != NULL) { + if (filter->flush_function) { (*filter->flush_function)(filter->data); } + return 0; } -/* - * wchar => UTF-8 - */ int mbfl_filt_conv_wchar_utf8(int c, mbfl_convert_filter *filter) { if (c >= 0 && c < 0x110000) { diff --git a/ext/mbstring/tests/illformed_utf_sequences.phpt b/ext/mbstring/tests/illformed_utf_sequences.phpt index c0283681573..a9c97cce118 100644 --- a/ext/mbstring/tests/illformed_utf_sequences.phpt +++ b/ext/mbstring/tests/illformed_utf_sequences.phpt @@ -22,28 +22,28 @@ var_dump(chk_enc("\x31\x32\x33", 0)); var_dump(chk_enc("\x41\x42\x43", 0)); var_dump(chk_enc("\xc0\xb1\xc0\xb2\xc0\xb3", 6)); var_dump(chk_enc("\xc1\x81\xc1\x82\xc1\x83", 6)); -var_dump(chk_enc("\xe0\x80\xb1\xe0\x80\xb2\xe0\x80\xb3", 9)); -var_dump(chk_enc("\xe0\x81\x81\xe0\x81\x82\xe0\x81\x83", 9)); -var_dump(chk_enc("\xf0\x80\x80\xb1\xf0\x80\x80\xb2\xf0\x80\x80\xb3", 12)); -var_dump(chk_enc("\xf0\x80\x81\x81\xf0\x80\x81\x82\xf0\x81\x83", 11)); +var_dump(chk_enc("\xe0\x80\xb1\xe0\x80\xb2\xe0\x80\xb3", 6)); +var_dump(chk_enc("\xe0\x81\x81\xe0\x81\x82\xe0\x81\x83", 6)); +var_dump(chk_enc("\xf0\x80\x80\xb1\xf0\x80\x80\xb2\xf0\x80\x80\xb3", 9)); +var_dump(chk_enc("\xf0\x80\x81\x81\xf0\x80\x81\x82\xf0\x81\x83", 8)); var_dump(chk_enc("\xf8\x80\x80\x80\xb1\xf8\x80\x80\x80\xb2\xf8\x80\x80\x80\xb3", 15)); var_dump(chk_enc("\xf8\x80\x80\x81\x81\xf8\x80\x80\x81\x82\xf8\x80\x80\x81\x83", 15)); var_dump(chk_enc("\xfc\x80\x80\x80\x80\xb1\xfc\x80\x80\x80\x80\xb2\xfc\x80\x80\x80\x80\xb3", 18)); var_dump(chk_enc("\xfc\x80\x80\x80\x81\x81\xfc\x80\x80\x80\x81\x82\xfc\x80\x80\x80\x81\x83", 18)); var_dump(chk_enc("\xc2\xa2\xc2\xa3\xc2\xa5", 0)); -var_dump(chk_enc("\xe0\x82\xa2\xe0\x82\xa3\xe0\x82\xa5", 9)); -var_dump(chk_enc("\xf0\x80\x82\xa2\xf0\x80\x82\xa3\xf0\x80\x82\xa5", 12)); +var_dump(chk_enc("\xe0\x82\xa2\xe0\x82\xa3\xe0\x82\xa5", 6)); +var_dump(chk_enc("\xf0\x80\x82\xa2\xf0\x80\x82\xa3\xf0\x80\x82\xa5", 9)); var_dump(chk_enc("\xf8\x80\x80\x82\xa2\xf8\x80\x80\x82\xa3\xf8\x80\x80\x82\xa5", 15)); var_dump(chk_enc("\xfc\x80\x80\x80\x82\xa2\xfc\x80\x80\x80\x82\xa3\xfc\x80\x80\x80\x82\xa5", 18)); var_dump(chk_enc("\xc1\xbf", 2)); var_dump(chk_enc("\xc2\x80", 0)); var_dump(chk_enc("\xdf\xbf", 0)); -var_dump(chk_enc("\xe0\x9f\xff", 3)); +var_dump(chk_enc("\xe0\x9f\xff", 2)); var_dump(chk_enc("\xe0\xa0\x80", 2)); var_dump(chk_enc("\xef\xbf\xbf", 0)); -var_dump(chk_enc("\xf0\x8f\xbf\xbf", 4)); +var_dump(chk_enc("\xf0\x8f\xbf\xbf", 3)); var_dump(chk_enc("\xf0\x90\x80\x80", 0)); var_dump(chk_enc("\xf7\xbf\xbf\xbf", 4)); var_dump(chk_enc("\xf8\x87\xbf\xbf\xbf", 5)); @@ -58,7 +58,7 @@ echo "UTF-8 and surrogates area\n"; $out = ''; $cnt = 0; for ($i = 0xd7ff; $i <= 0xe000; ++$i) { - $s = chk_enc(pack('C3', 0xe0 | ($i >> 12), 0x80 | ($i >> 6) & 0x3f, 0x80 | $i & 0x3f), 3); + $s = chk_enc(pack('C3', 0xe0 | ($i >> 12), 0x80 | ($i >> 6) & 0x3f, 0x80 | $i & 0x3f), 2); if ($s === false) { $cnt++; } else { diff --git a/ext/mbstring/tests/utf_encodings.phpt b/ext/mbstring/tests/utf_encodings.phpt new file mode 100644 index 00000000000..ae80958c321 --- /dev/null +++ b/ext/mbstring/tests/utf_encodings.phpt @@ -0,0 +1,1023 @@ +--TEST-- +Torture test for UTF-{7,8,16,32} +--EXTENSIONS-- +mbstring +--SKIPIF-- + +--FILE-- + 0xDFFF) // surrogates; included in UnicodeData.txt + $validCodepoints[pack('N', $cp)] = true; + } +} + +function testValidCodepoints($encoding) { + global $validCodepoints; + + $good = array_keys($validCodepoints); + shuffle($good); + + while (!empty($good)) { + $string = ''; + $length = min(rand(20,30), count($good)); + while ($length--) { + $string .= array_pop($good); + } + + $converted = mb_convert_encoding($string, $encoding, 'UTF-32BE'); + if ($converted === false) + die("mb_convert_encoding failed to convert UTF-32BE to $encoding." . + "\nString: " . bin2hex($string)); + testValidString($converted, $string, $encoding, 'UTF-32BE'); + } +} + +function testInvalidCodepoints($invalid, $encoding) { + global $validCodepoints; + + $good = array_keys($validCodepoints); + shuffle($good); + + foreach ($invalid as $bad => $expected) { + $good1 = array_pop($good); + $string = $bad . mb_convert_encoding($good1, $encoding, 'UTF-32BE'); + testInvalidString($string, $expected . $good1, $encoding, 'UTF-32BE'); + } +} + +echo "== UTF-8 ==\n"; + +testValidCodepoints('UTF-8'); + +testValidString('', '', 'UTF-8', 'UTF-32BE'); + +$invalid = array( + // Codepoints outside of valid 0-0x10FFFF range for Unicode + "\xF4\x90\x80\x80" => str_repeat("\x00\x00\x00%", 3), // CP 0x110000 + "\xF7\x80\x80\x80" => str_repeat("\x00\x00\x00%", 4), // CP 0x1C0000 + "\xF7\xBF\xBF\xBF" => str_repeat("\x00\x00\x00%", 4), // CP 0x1FFFFF + + // Reserved range for UTF-16 surrogate pairs + "\xED\xA0\x80" => str_repeat("\x00\x00\x00%", 2), // CP 0xD800 + "\xED\xAF\xBF" => str_repeat("\x00\x00\x00%", 2), // CP 0xDBFF + "\xED\xBF\xBF" => str_repeat("\x00\x00\x00%", 2), // CP 0xDFFF + + // Truncated characters + "\xDF" => "\x00\x00\x00%", // should have been 2-byte + "\xEF\xBF" => "\x00\x00\x00%", // should have been 3-byte + "\xF0\xBF\xBF" => "\x00\x00\x00%", // should have been 4-byte + + // Multi-byte characters which end too soon and go to ASCII + "\xDFA" => "\x00\x00\x00%\x00\x00\x00A", + "\xEF\xBFA" => "\x00\x00\x00%\x00\x00\x00A", + "\xF0\xBF\xBFA" => "\x00\x00\x00%\x00\x00\x00A", + + // Multi-byte characters which end too soon and go to another MB char + "\xDF\xDF\xBF" => "\x00\x00\x00%\x00\x00\x07\xFF", + "\xEF\xBF\xDF\xBF" => "\x00\x00\x00%\x00\x00\x07\xFF", + "\xF0\xBF\xBF\xDF\xBF" => "\x00\x00\x00%\x00\x00\x07\xFF", + + // Continuation bytes which appear outside of a MB char + "\x80" => "\x00\x00\x00%", + "A\x80" => "\x00\x00\x00A\x00\x00\x00%", + "\xDF\xBF\x80" => "\x00\x00\x07\xFF\x00\x00\x00%", + + // Overlong code units + // (Using more bytes than needed to encode a character) + "\xC1\xBF" => str_repeat("\x00\x00\x00%", 2), // didn't need 2 bytes + "\xE0\x9F\xBF" => str_repeat("\x00\x00\x00%", 2), // didn't need 3 bytes + "\xF0\x8F\xBF\xBF" => str_repeat("\x00\x00\x00%", 3) // didn't need 4 bytes +); + +testInvalidCodepoints($invalid, 'UTF-8'); + +echo "== UTF-16 ==\n"; + +testValidCodepoints("UTF-16"); +testValidCodepoints("UTF-16LE"); +testValidCodepoints("UTF-16BE"); + +testValidString('', '', 'UTF-16', 'UTF-32BE'); +testValidString('', '', 'UTF-16LE', 'UTF-32BE'); +testValidString('', '', 'UTF-16BE', 'UTF-32BE'); + +$invalid = array( + // UTF-16 _cannot_ represent codepoints bigger than 0x10FFFF, so we're not + // worried about that. But there are plenty of other ways to mess up... + + // Second half of surrogate pair comes first + "\xDC\x01\xD8\x02" => "\x00\x00\x00%\x00\x00\x00%", + + // First half of surrogate pair not followed by second part + "\xD8\x01\x00A" => "\x00\x00\x00%\x00\x00\x00A", + + // First half of surrogate pair at end of string + "\xD8\x01" => "\x00\x00\x00%", +); + +testInvalidCodepoints($invalid, 'UTF-16'); +testInvalidCodepoints($invalid, 'UTF-16BE'); + +// Truncated strings +testInvalidString("\x00", "\x00\x00\x00%", 'UTF-16', 'UTF-32BE'); +testInvalidString("\x00A\x01", "\x00\x00\x00A\x00\x00\x00%", 'UTF-16', 'UTF-32BE'); +testInvalidString("\x00", "\x00\x00\x00%", 'UTF-16BE', 'UTF-32BE'); +testInvalidString("\x00A\x01", "\x00\x00\x00A\x00\x00\x00%", 'UTF-16BE', 'UTF-32BE'); + +$invalid = array( + // Second half of surrogate pair comes first + "\x01\xDC\x02\xD8" => "\x00\x00\x00%\x00\x00\x00%", + + // First half of surrogate pair not followed by second part + "\x01\xD8A\x00" => "\x00\x00\x00%\x00\x00\x00A", + + // First half of surrogate pair at end of string + "\x01\xD8" => "\x00\x00\x00%", +); + +testInvalidCodepoints($invalid, 'UTF-16LE'); + +// Truncated +testInvalidString("\x00", "\x00\x00\x00%", 'UTF-16LE', 'UTF-32BE'); +testInvalidString("A\x00\x01", "\x00\x00\x00A\x00\x00\x00%", 'UTF-16LE', 'UTF-32BE'); + +// TODO: test handling of UTF-16 BOM + +echo "== UTF-32 ==\n"; + +testValidCodepoints("UTF-32LE"); +testValidCodepoints("UTF-32BE"); + +// Empty string +testValidString('', '', 'UTF-32', 'UTF-32BE'); +testValidString('', '', 'UTF-32BE', 'UTF-32'); +testValidString('', '', 'UTF-32LE', 'UTF-32BE'); + +$invalid = array( + // Codepoints which are too big + "\x00\x11\x00\x00" => "\x00\x00\x00%", + "\x80\x00\x00\x00" => "\x00\x00\x00%", + "\xff\xff\xfe\xff" => "\x00\x00\x00%", + + // Surrogates + "\x00\x00\xd8\x00" => "\x00\x00\x00%", + "\x00\x00\xdb\xff" => "\x00\x00\x00%", + "\x00\x00\xdc\x00" => "\x00\x00\x00%", + "\x00\x00\xdf\xff" => "\x00\x00\x00%", +); + +testInvalidCodepoints($invalid, 'UTF-32'); +testInvalidCodepoints($invalid, 'UTF-32BE'); + +// Truncated code units +testInvalidString("\x00\x01\x01", "\x00\x00\x00%", 'UTF-32', 'UTF-32BE'); +testInvalidString("\x00\x01", "\x00\x00\x00%", 'UTF-32', 'UTF-32BE'); +testInvalidString("\x00", "\x00\x00\x00%", 'UTF-32', 'UTF-32BE'); +testInvalidString("\x00", "\x00\x00\x00%", 'UTF-32BE', 'UTF-32'); +testInvalidString("\x00", "\x00\x00\x00%", 'UTF-32BE', 'UTF-32'); +testInvalidString("\x00", "\x00\x00\x00%", 'UTF-32BE', 'UTF-32'); + +$invalid = array( + // Codepoints which are too big + "\x00\x00\x11\x00" => "\x00\x00\x00%", + "\x00\x00\x00\x80" => "\x00\x00\x00%", + "\xff\xfe\xff\xff" => "\x00\x00\x00%", + + // Surrogates + "\x00\xd8\x00\x00" => "\x00\x00\x00%", + "\xff\xdb\x00\x00" => "\x00\x00\x00%", + "\x00\xdc\x00\x00" => "\x00\x00\x00%", + "\xff\xdf\x00\x00" => "\x00\x00\x00%", +); + +testInvalidCodepoints($invalid, 'UTF-32LE'); + +// Truncated code units +testInvalidString("\x00\x01\x01", "\x00\x00\x00%", 'UTF-32LE', 'UTF-32BE'); +testInvalidString("\x00\x01", "\x00\x00\x00%", 'UTF-32LE', 'UTF-32BE'); +testInvalidString("\x00", "\x00\x00\x00%", 'UTF-32LE', 'UTF-32BE'); + +// TODO: test handling of UTF-32 BOM + +echo "== UTF-7 ==\n"; + +testValidString('', '', 'UTF-7', 'UTF-32BE'); + +// 'Direct' characters +foreach (range(ord('A'), ord('Z')) as $byte) + testValidString(chr($byte), "\x00\x00\x00" . chr($byte), 'UTF-7', 'UTF-32BE'); +foreach (range(ord('a'), ord('z')) as $byte) + testValidString(chr($byte), "\x00\x00\x00" . chr($byte), 'UTF-7', 'UTF-32BE'); +foreach (range(ord('0'), ord('9')) as $byte) + testValidString(chr($byte), "\x00\x00\x00" . chr($byte), 'UTF-7', 'UTF-32BE'); +foreach (str_split("'(),-./:?") as $char) + testValidString($char, "\x00\x00\x00" . $char, 'UTF-7', 'UTF-32BE'); + +// 'Optional direct' characters are Base64-encoded in mbstring's implementation + +// Whitespace +foreach (str_split(" \t\r\n\x00") as $char) + testValidString($char, "\x00\x00\x00" . $char, 'UTF-7', 'UTF-32BE'); + +// Encoding + as +- +testValidString('+-', "\x00\x00\x00+", 'UTF-7', 'UTF-32BE', false); + +// UTF-16 + Base64 encoding +function encode($str, $encoding) { + // Base64 encoding for UTF-7 doesn't use '=' for padding + return str_replace('=', '', base64_encode(mb_convert_encoding($str, 'UTF-16BE', $encoding))); +} + +for ($i = 0; $i < 256; $i++) { + $reversible = true; + if ($i >= ord('A') && $i <= ord('Z')) + $reversible = false; + if ($i >= ord('a') && $i <= ord('z')) + $reversible = false; + if ($i >= ord('0') && $i <= ord('9')) + $reversible = false; + if (strpos("'(),-./:?\x00 \t\r\n", chr($i)) !== false) + $reversible = false; + + testValidString('+' . encode("\x00" . chr($i), 'UTF-16BE') . '-', "\x00\x00\x00" . chr($i), 'UTF-7', 'UTF-32BE', $reversible); +} + +testValidString('+' . encode("\x12\x34", 'UTF-16BE') . '-', "\x00\x00\x12\x34", 'UTF-7', 'UTF-32BE'); +testValidString('+' . encode("\x12\x34\x56\x78", 'UTF-16BE') . '-', "\x00\x00\x12\x34\x00\x00\x56\x78", 'UTF-7', 'UTF-32BE'); +testValidString('+' . encode("\x12\x34\x56\x78\x00\x40", 'UTF-16BE') . '-', "\x00\x00\x12\x34\x00\x00\x56\x78\x00\x00\x00\x40", 'UTF-7', 'UTF-32BE'); + +// Surrogate pair +testValidString('+' . encode("\x00\x01\x04\x00", 'UTF-32BE') . '-', "\x00\x01\x04\x00", 'UTF-7', 'UTF-32BE'); +testValidString('+' . encode("\x00\x00\x00A\x00\x01\x04\x00\x00\x00\x00B", 'UTF-32BE') . '-', "\x00\x00\x00A\x00\x01\x04\x00\x00\x00\x00B", 'UTF-7', 'UTF-32BE', false); +testValidString('+' . encode("\x00\x01\x04\x00\x00\x01\x04\x00", 'UTF-32BE') . '-', "\x00\x01\x04\x00\x00\x01\x04\x00", 'UTF-7', 'UTF-32BE'); + +// Unterminated + section +// (This is not considered illegal) +testValidString('+' . encode('ABC', 'ASCII'), "\x00A\x00B\x00C", 'UTF-7', 'UTF-16BE', false); + +// + sections immediately after each other +// (This isn't illegal either) +testValidString('+' . encode('AB', 'ASCII') . '-+' . encode('CD', 'ASCII') . '-', "\x00A\x00B\x00C\x00D", 'UTF-7', 'UTF-16BE', false); + +// + sections not immediately after each other +// (Just trying to be exhaustive here) +testValidString('+' . encode('AB', 'ASCII') . '-!+' . encode('CD', 'ASCII') . '-', "\x00A\x00B\x00!\x00C\x00D", 'UTF-7', 'UTF-16BE', false); + +// + section terminated by a non-Base64 ASCII character which is NOT - +for ($i = 0; $i < 128; $i++) { + if ($i >= ord('A') && $i <= ord('Z')) + continue; + if ($i >= ord('a') && $i <= ord('z')) + continue; + if ($i >= ord('0') && $i <= ord('9')) + continue; + if ($i == ord('+') || $i == ord('/') || $i == ord('-') || $i == ord('\\') || $i == ord('~')) + continue; + $char = chr($i); + testValidString('+' . encode("\x12\x34", 'UTF-16BE') . $char, "\x00\x00\x12\x34\x00\x00\x00" . $char, 'UTF-7', 'UTF-32BE', false); +} + +// Now let's see how UTF-7 can go BAD... + +function rawEncode($str) { + return str_replace('=', '', base64_encode($str)); +} + +// First, messed up UTF16 in + section +// Second half of surrogate pair coming first +testInvalidString('+' . rawEncode("\xDC\x01\xD8\x02") . '-', "\x00\x00\x00%\x00\x00\x00%", 'UTF-7', 'UTF-32BE'); + +// First half of surrogate pair not followed by second half +testInvalidString('+' . rawEncode("\xD8\x01\x00A") . '-', "\x00\x00\x00%\x00\x00\x00A", 'UTF-7', 'UTF-32BE'); +testInvalidString('+' . rawEncode("\xD8\x01\xD9\x02") . '-', "\x00\x00\x00%\x00\x00\x00%", 'UTF-7', 'UTF-32BE'); + +// First half of surrogate pair appearing at end of string +testInvalidString('+' . rawEncode("\xD8\x01") . '-', "\x00\x00\x00%", 'UTF-7', 'UTF-32BE'); + +// Truncated string +testInvalidString('+' . rawEncode("\x01") . '-', "\x00\x00\x00%", 'UTF-7', 'UTF-32BE'); + +// And then, messed up Base64 encoding + +// Bad padding on + section (not zeroes) +$encoded = encode("\x12\x34", 'UTF-16BE'); // 3 Base64 bytes, 2 bits of padding... +$corrupted = substr($encoded, 0, 2) . chr(ord($encoded[2]) + 1); +testInvalidString('+' . $corrupted . '-', "\x00\x00\x12\x34\x00\x00\x00%", 'UTF-7', 'UTF-32BE'); + +echo "Done!\n"; + +?> +--EXPECT-- +== UTF-8 == +== UTF-16 == +== UTF-32 == +== UTF-7 == +Done!