mirror of
https://github.com/php/php-src.git
synced 2026-04-22 07:28:09 +02:00
b9cd1cdb4f
The performance gain from this change depends on the text encoding and input string size. For very small strings, other overheads tend to swamp the performance gains to some extent, such that the speedup is less than 2x. For medium-length strings (around 100 bytes), the speedup is typically around 2.5x. The greatest performance gains are for UTF-8 strings which have already been marked as valid (using the GC flags on the zend_string object); for those, the speedup is more than 10x in many cases. The previous implementation first converted the haystack and needle to wchars, then searched for matches between the two sequences of wchars. Because we use -1 as an error marker when converting to wchars, error markers from invalid byte sequences in the haystack would match error markers from invalid byte sequences in the needle, even if the specific invalid byte sequence was different. I am not sure whether this behavior is really desirable, but in any case, this new implementation follows the same behavior so as not to cause backward-compatibility (BC) breaks.
87 lines
3.0 KiB
PHP
87 lines
3.0 KiB
PHP
--TEST--
|
|
mb_substr_count()
|
|
--EXTENSIONS--
|
|
mbstring
|
|
--INI--
|
|
output_handler=
|
|
--FILE--
|
|
<?php
// Checks mb_substr_count() against empty needles, empty haystacks, several
// text encodings, and two bugs of the previous implementation.

// Calls below that pass no explicit encoding use this internal encoding
mb_internal_encoding("EUC-JP");

print "== Empty needle should raise an error ==\n";
try {
    var_dump(mb_substr_count("", ""));
} catch (\ValueError $e) {
    echo $e->getMessage() . \PHP_EOL;
}
try {
    // Non-empty haystack, written with byte escapes like the other EUC-JP
    // literals in this test so that it survives re-encoding of the file
    var_dump(mb_substr_count("\xA4\xA2", ""));
} catch (\ValueError $e) {
    echo $e->getMessage() . \PHP_EOL;
}
try {
    // Although the needle below contains 3 bytes, it decodes to zero Unicode codepoints
    // So the needle is actually 'empty', although it doesn't appear so
    var_dump(mb_substr_count("abcdef", "\x1B(B", "ISO-2022-JP"));
} catch (\ValueError $e) {
    echo $e->getMessage() . \PHP_EOL;
}

print "== Return value for empty haystack should always be zero ==\n";
var_dump(mb_substr_count("", "\xA4\xA2"));
var_dump(mb_substr_count("", chr(0)));

print "== Try searching using various encodings ==\n";
// "bca" occurs once in each of the 100 repetitions of "abcacba"
$a = str_repeat("abcacba", 100);
var_dump(mb_substr_count($a, "bca"));

// Multi-byte haystack and needle, searched using the internal encoding
$a = str_repeat("\xA4\xA2\xA4\xA4\xA4\xA6\xA4\xA2\xA4\xA6\xA4\xA4\xA4\xA2", 100);
$b = "\xA4\xA4\xA4\xA6\xA4\xA2";
var_dump(mb_substr_count($a, $b));

// Same haystack and needle after conversion to other encodings; the count
// is expected to stay the same
$to_enc = "UTF-8";
var_dump(mb_substr_count(mb_convert_encoding($a, $to_enc),
    mb_convert_encoding($b, $to_enc), $to_enc));

$to_enc = "Shift_JIS";
var_dump(mb_substr_count(mb_convert_encoding($a, $to_enc),
    mb_convert_encoding($b, $to_enc), $to_enc));

// "bca" occurs twice in each of the 100 repetitions of "abcacbabca"
$a = str_repeat("abcacbabca", 100);
var_dump(mb_substr_count($a, "bca"));

print "== Regression tests ==\n";

// The old implementation had a bug; it could only recognize a maximum of one
// match for each byte that it fed into the decoder, even if feeding in that
// byte caused two codepoints to be emitted (because the decoder was holding
// cached data), and both of those codepoints matched a 1-codepoint needle
// (For this example, two error markers are emitted for the final byte 0xFF)
echo mb_substr_count("\xef\xff", "\xf8", "UTF-8"), "\n";

// Another thing about the old implementation: if a final codepoint was emitted
// by a decoder flush function, and that codepoint finished a match with the
// needle, that match would be disregarded and not counted in the returned value
// (In practice, the only thing emitted from decoder flush functions is an error
// marker, if the string ended in an illegal state)
echo mb_substr_count("+", "+", "UTF7-IMAP"), "\n";

?>
|
|
--EXPECT--
|
|
== Empty needle should raise an error ==
|
|
mb_substr_count(): Argument #2 ($needle) must not be empty
|
|
mb_substr_count(): Argument #2 ($needle) must not be empty
|
|
mb_substr_count(): Argument #2 ($needle) must not be empty
|
|
== Return value for empty haystack should always be zero ==
|
|
int(0)
|
|
int(0)
|
|
== Try searching using various encodings ==
|
|
int(100)
|
|
int(100)
|
|
int(100)
|
|
int(100)
|
|
int(200)
|
|
== Regression tests ==
|
|
2
|
|
1
|