1
0
mirror of https://github.com/php/php-src.git synced 2026-03-24 00:02:20 +01:00

Speed boost for mb_stripos (when not using UTF-8)

Instead of case-folding a string and then converting it to UTF-8 as a
separate operation, why not convert it to UTF-8 at the same time as
we fold case?

For non-UTF-8 encodings, this typically makes mb_stripos about 2x
faster.
This commit is contained in:
Alex Dowad
2022-12-14 14:11:10 +02:00
parent e288438373
commit 744ca16e73
4 changed files with 18 additions and 19 deletions

View File

@@ -2878,7 +2878,7 @@ PHP_FUNCTION(mb_convert_encoding)
static zend_string *mbstring_convert_case(php_case_mode case_mode, const char *str, size_t str_len, const mbfl_encoding *enc)
{
return php_unicode_convert_case(case_mode, str, str_len, enc, MBSTRG(current_filter_illegal_mode), MBSTRG(current_filter_illegal_substchar));
return php_unicode_convert_case(case_mode, str, str_len, enc, enc, MBSTRG(current_filter_illegal_mode), MBSTRG(current_filter_illegal_substchar));
}
PHP_FUNCTION(mb_convert_case)
@@ -4858,10 +4858,10 @@ MBSTRING_API size_t php_mb_stripos(bool mode, zend_string *haystack, zend_string
{
/* We're using simple case-folding here, because we'd have to deal with remapping of
* offsets otherwise. */
zend_string *haystack_conv = mbstring_convert_case(PHP_UNICODE_CASE_FOLD_SIMPLE, ZSTR_VAL(haystack), ZSTR_LEN(haystack), enc);
zend_string *needle_conv = mbstring_convert_case(PHP_UNICODE_CASE_FOLD_SIMPLE, ZSTR_VAL(needle), ZSTR_LEN(needle), enc);
zend_string *haystack_conv = php_unicode_convert_case(PHP_UNICODE_CASE_FOLD_SIMPLE, ZSTR_VAL(haystack), ZSTR_LEN(haystack), enc, &mbfl_encoding_utf8, MBSTRG(current_filter_illegal_mode), MBSTRG(current_filter_illegal_substchar));
zend_string *needle_conv = php_unicode_convert_case(PHP_UNICODE_CASE_FOLD_SIMPLE, ZSTR_VAL(needle), ZSTR_LEN(needle), enc, &mbfl_encoding_utf8, MBSTRG(current_filter_illegal_mode), MBSTRG(current_filter_illegal_substchar));
size_t n = mb_find_strpos(haystack_conv, needle_conv, enc, offset, mode);
size_t n = mb_find_strpos(haystack_conv, needle_conv, &mbfl_encoding_utf8, offset, mode);
zend_string_free(haystack_conv);
zend_string_free(needle_conv);

View File

@@ -238,7 +238,7 @@ static uint32_t *emit_special_casing_sequence(uint32_t w, uint32_t *out)
return out;
}
MBSTRING_API zend_string *php_unicode_convert_case(php_case_mode case_mode, const char *srcstr, size_t in_len, const mbfl_encoding *src_encoding, int illegal_mode, uint32_t illegal_substchar)
MBSTRING_API zend_string *php_unicode_convert_case(php_case_mode case_mode, const char *srcstr, size_t in_len, const mbfl_encoding *src_encoding, const mbfl_encoding *dst_encoding, int illegal_mode, uint32_t illegal_substchar)
{
/* A Unicode codepoint can expand out to up to 3 codepoints when uppercased, lowercased, or title cased
* See http://www.unicode.org/Public/UNIDATA/SpecialCasing.txt */
@@ -363,7 +363,7 @@ MBSTRING_API zend_string *php_unicode_convert_case(php_case_mode case_mode, cons
}
ZEND_ASSERT(p - converted_buf <= 192);
src_encoding->from_wchar(converted_buf, p - converted_buf, &buf, !in_len);
dst_encoding->from_wchar(converted_buf, p - converted_buf, &buf, !in_len);
}
return mb_convert_buf_result(&buf);

View File

@@ -91,7 +91,7 @@ typedef enum {
MBSTRING_API zend_string *php_unicode_convert_case(
php_case_mode case_mode, const char *srcstr, size_t srclen,
const mbfl_encoding *src_encoding, int illegal_mode, uint32_t illegal_substchar);
const mbfl_encoding *src_encoding, const mbfl_encoding *dst_encoding, int illegal_mode, uint32_t illegal_substchar);
/* Optimize the common ASCII case for lower/upper */

View File

@@ -9,9 +9,8 @@ mbstring
ini_set('include_path','.');
include_once('common.inc');
// Test string
$euc_jp = '0123この文字列は日本語です。EUC-JPを使っています。0123日本語は面倒臭い。';
$euc_jp = "0123\xA4\xB3\xA4\xCE\xCA\xB8\xBB\xFA\xCE\xF3\xA4\xCF\xC6\xFC\xCB\xDC\xB8\xEC\xA4\xC7\xA4\xB9\xA1\xA3EUC-JP\xA4\xF2\xBB\xC8\xA4\xC3\xA4\xC6\xA4\xA4\xA4\xDE\xA4\xB9\xA1\xA30123\xC6\xFC\xCB\xDC\xB8\xEC\xA4\xCF\xCC\xCC\xC5\xDD\xBD\xAD\xA4\xA4\xA1\xA3";
$slen = mb_strlen($euc_jp, 'EUC-JP');
echo "String len: $slen\n";
@@ -21,11 +20,11 @@ mb_internal_encoding('UTF-8') or print("mb_internal_encoding() failed\n");
echo "== POSITIVE OFFSET ==\n";
print mb_stripos($euc_jp, '日本語', 0, 'EUC-JP') . "\n";
print mb_stripos($euc_jp, "\xC6\xFC\xCB\xDC\xB8\xEC", 0, 'EUC-JP') . "\n";
print mb_stripos($euc_jp, '0', 0, 'EUC-JP') . "\n";
print mb_stripos($euc_jp, 3, 0, 'EUC-JP') . "\n";
print mb_stripos($euc_jp, 0, 0, 'EUC-JP') . "\n";
print mb_stripos($euc_jp, '日本語', 15, 'EUC-JP') . "\n";
print mb_stripos($euc_jp, "\xC6\xFC\xCB\xDC\xB8\xEC", 15, 'EUC-JP') . "\n";
print mb_stripos($euc_jp, '0', 15, 'EUC-JP') . "\n";
print mb_stripos($euc_jp, 3, 15, 'EUC-JP') . "\n";
print mb_stripos($euc_jp, 0, 15, 'EUC-JP') . "\n";
@@ -34,7 +33,7 @@ print mb_stripos($euc_jp, 0, 15, 'EUC-JP') . "\n";
// Negative offset
echo "== NEGATIVE OFFSET ==\n";
print mb_stripos($euc_jp, '日本語', -15, 'EUC-JP') . "\n";
print mb_stripos($euc_jp, "\xC6\xFC\xCB\xDC\xB8\xEC", -15, 'EUC-JP') . "\n";
print mb_stripos($euc_jp, '0', -15, 'EUC-JP') . "\n";
print mb_stripos($euc_jp, 3, -15, 'EUC-JP') . "\n";
print mb_stripos($euc_jp, 0, -15, 'EUC-JP') . "\n";
@@ -44,7 +43,7 @@ print mb_stripos($euc_jp, 0, -43, 'EUC-JP') . "\n";
// Out of range - should return false
print ("== OUT OF RANGE ==\n");
$r = mb_stripos($euc_jp, '日本語', 40, 'EUC-JP');
$r = mb_stripos($euc_jp, "\xC6\xFC\xCB\xDC\xB8\xEC", 40, 'EUC-JP');
($r === FALSE) ? print "OK_OUT_RANGE\n" : print "NG_OUT_RANGE\n";
$r = mb_stripos($euc_jp, '0', 40, 'EUC-JP');
($r === FALSE) ? print "OK_OUT_RANGE\n" : print "NG_OUT_RANGE\n";
@@ -52,7 +51,7 @@ $r = mb_stripos($euc_jp, 3, 40, 'EUC-JP');
($r === FALSE) ? print "OK_OUT_RANGE\n" : print "NG_OUT_RANGE\n";
$r = mb_stripos($euc_jp, 0, 40, 'EUC-JP');
($r === FALSE) ? print "OK_OUT_RANGE\n" : print "NG_OUT_RANGE\n";
$r = mb_stripos($euc_jp, '日本語', -3, 'EUC-JP');
$r = mb_stripos($euc_jp, "\xC6\xFC\xCB\xDC\xB8\xEC", -3, 'EUC-JP');
($r === FALSE) ? print "OK_OUT_RANGE\n" : print "NG_OUT_RANGE\n";
$r = mb_stripos($euc_jp, '0', -3, 'EUC-JP');
($r === FALSE) ? print "OK_OUT_RANGE\n" : print "NG_OUT_RANGE\n";
@@ -65,7 +64,7 @@ $r = mb_stripos($euc_jp, 0, -3, 'EUC-JP');
// Non-existent
echo "== NON-EXISTENT ==\n";
$r = mb_stripos($euc_jp, '韓国語', 0, 'EUC-JP');
$r = mb_stripos($euc_jp, "\xB4\xDA\xB9\xF1\xB8\xEC", 0, 'EUC-JP');
($r === FALSE) ? print "OK_STR\n" : print "NG_STR\n";
$r = mb_stripos($euc_jp, "\n", 0, 'EUC-JP');
($r === FALSE) ? print "OK_NEWLINE\n" : print "NG_NEWLINE\n";
@@ -76,12 +75,12 @@ echo "== NO ENCODING PARAMETER ==\n";
mb_internal_encoding('EUC-JP') or print("mb_internal_encoding() failed\n");
print mb_stripos($euc_jp, '日本語', 0) . "\n";
print mb_stripos($euc_jp, "\xC6\xFC\xCB\xDC\xB8\xEC", 0) . "\n";
print mb_stripos($euc_jp, '0', 0) . "\n";
print mb_stripos($euc_jp, 3, 0) . "\n";
print mb_stripos($euc_jp, 0, 0) . "\n";
$r = mb_stripos($euc_jp, '韓国語', 0);
$r = mb_stripos($euc_jp, "\xB4\xDA\xB9\xF1\xB8\xEC", 0);
($r === FALSE) ? print "OK_STR\n" : print "NG_STR\n";
$r = mb_stripos($euc_jp, "\n", 0);
($r === FALSE) ? print "OK_NEWLINE\n" : print "NG_NEWLINE\n";
@@ -91,12 +90,12 @@ echo "== NO OFFSET AND ENCODING PARAMETER ==\n";
mb_internal_encoding('EUC-JP') or print("mb_internal_encoding() failed\n");
print mb_stripos($euc_jp, '日本語') . "\n";
print mb_stripos($euc_jp, "\xC6\xFC\xCB\xDC\xB8\xEC") . "\n";
print mb_stripos($euc_jp, '0') . "\n";
print mb_stripos($euc_jp, 3) . "\n";
print mb_stripos($euc_jp, 0) . "\n";
$r = mb_stripos($euc_jp, '韓国語');
$r = mb_stripos($euc_jp, "\xB4\xDA\xB9\xF1\xB8\xEC");
($r === FALSE) ? print "OK_STR\n" : print "NG_STR\n";
$r = mb_stripos($euc_jp, "\n");
($r === FALSE) ? print "OK_NEWLINE\n" : print "NG_NEWLINE\n";