mirror of
https://github.com/php/php-src.git
synced 2026-03-24 00:02:20 +01:00
Speed boost for mb_stripos (when not using UTF-8)
Previously, mb_stripos case-folded the haystack and needle and then converted them to UTF-8 as a separate operation. php_unicode_convert_case now takes a destination encoding, so the strings are converted to UTF-8 at the same time as their case is folded. For non-UTF-8 encodings, this typically makes mb_stripos about 2x faster.
This commit is contained in:
@@ -2878,7 +2878,7 @@ PHP_FUNCTION(mb_convert_encoding)
|
||||
|
||||
static zend_string *mbstring_convert_case(php_case_mode case_mode, const char *str, size_t str_len, const mbfl_encoding *enc)
|
||||
{
|
||||
return php_unicode_convert_case(case_mode, str, str_len, enc, MBSTRG(current_filter_illegal_mode), MBSTRG(current_filter_illegal_substchar));
|
||||
return php_unicode_convert_case(case_mode, str, str_len, enc, enc, MBSTRG(current_filter_illegal_mode), MBSTRG(current_filter_illegal_substchar));
|
||||
}
|
||||
|
||||
PHP_FUNCTION(mb_convert_case)
|
||||
@@ -4858,10 +4858,10 @@ MBSTRING_API size_t php_mb_stripos(bool mode, zend_string *haystack, zend_string
|
||||
{
|
||||
/* We're using simple case-folding here, because we'd have to deal with remapping of
|
||||
* offsets otherwise. */
|
||||
zend_string *haystack_conv = mbstring_convert_case(PHP_UNICODE_CASE_FOLD_SIMPLE, ZSTR_VAL(haystack), ZSTR_LEN(haystack), enc);
|
||||
zend_string *needle_conv = mbstring_convert_case(PHP_UNICODE_CASE_FOLD_SIMPLE, ZSTR_VAL(needle), ZSTR_LEN(needle), enc);
|
||||
zend_string *haystack_conv = php_unicode_convert_case(PHP_UNICODE_CASE_FOLD_SIMPLE, ZSTR_VAL(haystack), ZSTR_LEN(haystack), enc, &mbfl_encoding_utf8, MBSTRG(current_filter_illegal_mode), MBSTRG(current_filter_illegal_substchar));
|
||||
zend_string *needle_conv = php_unicode_convert_case(PHP_UNICODE_CASE_FOLD_SIMPLE, ZSTR_VAL(needle), ZSTR_LEN(needle), enc, &mbfl_encoding_utf8, MBSTRG(current_filter_illegal_mode), MBSTRG(current_filter_illegal_substchar));
|
||||
|
||||
size_t n = mb_find_strpos(haystack_conv, needle_conv, enc, offset, mode);
|
||||
size_t n = mb_find_strpos(haystack_conv, needle_conv, &mbfl_encoding_utf8, offset, mode);
|
||||
|
||||
zend_string_free(haystack_conv);
|
||||
zend_string_free(needle_conv);
|
||||
|
||||
@@ -238,7 +238,7 @@ static uint32_t *emit_special_casing_sequence(uint32_t w, uint32_t *out)
|
||||
return out;
|
||||
}
|
||||
|
||||
MBSTRING_API zend_string *php_unicode_convert_case(php_case_mode case_mode, const char *srcstr, size_t in_len, const mbfl_encoding *src_encoding, int illegal_mode, uint32_t illegal_substchar)
|
||||
MBSTRING_API zend_string *php_unicode_convert_case(php_case_mode case_mode, const char *srcstr, size_t in_len, const mbfl_encoding *src_encoding, const mbfl_encoding *dst_encoding, int illegal_mode, uint32_t illegal_substchar)
|
||||
{
|
||||
/* A Unicode codepoint can expand out to up to 3 codepoints when uppercased, lowercased, or title cased
|
||||
* See http://www.unicode.org/Public/UNIDATA/SpecialCasing.txt */
|
||||
@@ -363,7 +363,7 @@ MBSTRING_API zend_string *php_unicode_convert_case(php_case_mode case_mode, cons
|
||||
}
|
||||
|
||||
ZEND_ASSERT(p - converted_buf <= 192);
|
||||
src_encoding->from_wchar(converted_buf, p - converted_buf, &buf, !in_len);
|
||||
dst_encoding->from_wchar(converted_buf, p - converted_buf, &buf, !in_len);
|
||||
}
|
||||
|
||||
return mb_convert_buf_result(&buf);
|
||||
|
||||
@@ -91,7 +91,7 @@ typedef enum {
|
||||
|
||||
MBSTRING_API zend_string *php_unicode_convert_case(
|
||||
php_case_mode case_mode, const char *srcstr, size_t srclen,
|
||||
const mbfl_encoding *src_encoding, int illegal_mode, uint32_t illegal_substchar);
|
||||
const mbfl_encoding *src_encoding, const mbfl_encoding *dst_encoding, int illegal_mode, uint32_t illegal_substchar);
|
||||
|
||||
/* Optimize the common ASCII case for lower/upper */
|
||||
|
||||
|
||||
@@ -9,9 +9,8 @@ mbstring
|
||||
ini_set('include_path','.');
|
||||
include_once('common.inc');
|
||||
|
||||
|
||||
// Test string
|
||||
$euc_jp = '0123この文字列は日本語です。EUC-JPを使っています。0123日本語は面倒臭い。';
|
||||
$euc_jp = "0123\xA4\xB3\xA4\xCE\xCA\xB8\xBB\xFA\xCE\xF3\xA4\xCF\xC6\xFC\xCB\xDC\xB8\xEC\xA4\xC7\xA4\xB9\xA1\xA3EUC-JP\xA4\xF2\xBB\xC8\xA4\xC3\xA4\xC6\xA4\xA4\xA4\xDE\xA4\xB9\xA1\xA30123\xC6\xFC\xCB\xDC\xB8\xEC\xA4\xCF\xCC\xCC\xC5\xDD\xBD\xAD\xA4\xA4\xA1\xA3";
|
||||
|
||||
$slen = mb_strlen($euc_jp, 'EUC-JP');
|
||||
echo "String len: $slen\n";
|
||||
@@ -21,11 +20,11 @@ mb_internal_encoding('UTF-8') or print("mb_internal_encoding() failed\n");
|
||||
|
||||
echo "== POSITIVE OFFSET ==\n";
|
||||
|
||||
print mb_stripos($euc_jp, '日本語', 0, 'EUC-JP') . "\n";
|
||||
print mb_stripos($euc_jp, "\xC6\xFC\xCB\xDC\xB8\xEC", 0, 'EUC-JP') . "\n";
|
||||
print mb_stripos($euc_jp, '0', 0, 'EUC-JP') . "\n";
|
||||
print mb_stripos($euc_jp, 3, 0, 'EUC-JP') . "\n";
|
||||
print mb_stripos($euc_jp, 0, 0, 'EUC-JP') . "\n";
|
||||
print mb_stripos($euc_jp, '日本語', 15, 'EUC-JP') . "\n";
|
||||
print mb_stripos($euc_jp, "\xC6\xFC\xCB\xDC\xB8\xEC", 15, 'EUC-JP') . "\n";
|
||||
print mb_stripos($euc_jp, '0', 15, 'EUC-JP') . "\n";
|
||||
print mb_stripos($euc_jp, 3, 15, 'EUC-JP') . "\n";
|
||||
print mb_stripos($euc_jp, 0, 15, 'EUC-JP') . "\n";
|
||||
@@ -34,7 +33,7 @@ print mb_stripos($euc_jp, 0, 15, 'EUC-JP') . "\n";
|
||||
// Negative offset
|
||||
echo "== NEGATIVE OFFSET ==\n";
|
||||
|
||||
print mb_stripos($euc_jp, '日本語', -15, 'EUC-JP') . "\n";
|
||||
print mb_stripos($euc_jp, "\xC6\xFC\xCB\xDC\xB8\xEC", -15, 'EUC-JP') . "\n";
|
||||
print mb_stripos($euc_jp, '0', -15, 'EUC-JP') . "\n";
|
||||
print mb_stripos($euc_jp, 3, -15, 'EUC-JP') . "\n";
|
||||
print mb_stripos($euc_jp, 0, -15, 'EUC-JP') . "\n";
|
||||
@@ -44,7 +43,7 @@ print mb_stripos($euc_jp, 0, -43, 'EUC-JP') . "\n";
|
||||
// Out of range - should return false
|
||||
print ("== OUT OF RANGE ==\n");
|
||||
|
||||
$r = mb_stripos($euc_jp, '日本語', 40, 'EUC-JP');
|
||||
$r = mb_stripos($euc_jp, "\xC6\xFC\xCB\xDC\xB8\xEC", 40, 'EUC-JP');
|
||||
($r === FALSE) ? print "OK_OUT_RANGE\n" : print "NG_OUT_RANGE\n";
|
||||
$r = mb_stripos($euc_jp, '0', 40, 'EUC-JP');
|
||||
($r === FALSE) ? print "OK_OUT_RANGE\n" : print "NG_OUT_RANGE\n";
|
||||
@@ -52,7 +51,7 @@ $r = mb_stripos($euc_jp, 3, 40, 'EUC-JP');
|
||||
($r === FALSE) ? print "OK_OUT_RANGE\n" : print "NG_OUT_RANGE\n";
|
||||
$r = mb_stripos($euc_jp, 0, 40, 'EUC-JP');
|
||||
($r === FALSE) ? print "OK_OUT_RANGE\n" : print "NG_OUT_RANGE\n";
|
||||
$r = mb_stripos($euc_jp, '日本語', -3, 'EUC-JP');
|
||||
$r = mb_stripos($euc_jp, "\xC6\xFC\xCB\xDC\xB8\xEC", -3, 'EUC-JP');
|
||||
($r === FALSE) ? print "OK_OUT_RANGE\n" : print "NG_OUT_RANGE\n";
|
||||
$r = mb_stripos($euc_jp, '0', -3, 'EUC-JP');
|
||||
($r === FALSE) ? print "OK_OUT_RANGE\n" : print "NG_OUT_RANGE\n";
|
||||
@@ -65,7 +64,7 @@ $r = mb_stripos($euc_jp, 0, -3, 'EUC-JP');
|
||||
// Non-existent
|
||||
echo "== NON-EXISTENT ==\n";
|
||||
|
||||
$r = mb_stripos($euc_jp, '韓国語', 0, 'EUC-JP');
|
||||
$r = mb_stripos($euc_jp, "\xB4\xDA\xB9\xF1\xB8\xEC", 0, 'EUC-JP');
|
||||
($r === FALSE) ? print "OK_STR\n" : print "NG_STR\n";
|
||||
$r = mb_stripos($euc_jp, "\n", 0, 'EUC-JP');
|
||||
($r === FALSE) ? print "OK_NEWLINE\n" : print "NG_NEWLINE\n";
|
||||
@@ -76,12 +75,12 @@ echo "== NO ENCODING PARAMETER ==\n";
|
||||
|
||||
mb_internal_encoding('EUC-JP') or print("mb_internal_encoding() failed\n");
|
||||
|
||||
print mb_stripos($euc_jp, '日本語', 0) . "\n";
|
||||
print mb_stripos($euc_jp, "\xC6\xFC\xCB\xDC\xB8\xEC", 0) . "\n";
|
||||
print mb_stripos($euc_jp, '0', 0) . "\n";
|
||||
print mb_stripos($euc_jp, 3, 0) . "\n";
|
||||
print mb_stripos($euc_jp, 0, 0) . "\n";
|
||||
|
||||
$r = mb_stripos($euc_jp, '韓国語', 0);
|
||||
$r = mb_stripos($euc_jp, "\xB4\xDA\xB9\xF1\xB8\xEC", 0);
|
||||
($r === FALSE) ? print "OK_STR\n" : print "NG_STR\n";
|
||||
$r = mb_stripos($euc_jp, "\n", 0);
|
||||
($r === FALSE) ? print "OK_NEWLINE\n" : print "NG_NEWLINE\n";
|
||||
@@ -91,12 +90,12 @@ echo "== NO OFFSET AND ENCODING PARAMETER ==\n";
|
||||
|
||||
mb_internal_encoding('EUC-JP') or print("mb_internal_encoding() failed\n");
|
||||
|
||||
print mb_stripos($euc_jp, '日本語') . "\n";
|
||||
print mb_stripos($euc_jp, "\xC6\xFC\xCB\xDC\xB8\xEC") . "\n";
|
||||
print mb_stripos($euc_jp, '0') . "\n";
|
||||
print mb_stripos($euc_jp, 3) . "\n";
|
||||
print mb_stripos($euc_jp, 0) . "\n";
|
||||
|
||||
$r = mb_stripos($euc_jp, '韓国語');
|
||||
$r = mb_stripos($euc_jp, "\xB4\xDA\xB9\xF1\xB8\xEC");
|
||||
($r === FALSE) ? print "OK_STR\n" : print "NG_STR\n";
|
||||
$r = mb_stripos($euc_jp, "\n");
|
||||
($r === FALSE) ? print "OK_NEWLINE\n" : print "NG_NEWLINE\n";
|
||||
|
||||
Reference in New Issue
Block a user