1
0
mirror of https://github.com/php/php-src.git synced 2026-03-24 00:02:20 +01:00

Optimize mb_strcut for fixed-byte-length text encodings

In microbenchmarks run on my dev machine, mb_strcut is now ~50% faster
for fixed-byte-length text encodings like ASCII. (The speedup comes from
the new fast path building the result string directly; the previous code
did an extra, unnecessary copy operation on the resulting output string.)
This commit is contained in:
Alex Dowad
2023-11-29 19:47:14 +02:00
parent 58fc521713
commit 5f1477d144
2 changed files with 61 additions and 5 deletions

View File

@@ -2443,12 +2443,22 @@ PHP_FUNCTION(mb_strcut)
if (enc->cut) {
RETURN_STR(enc->cut(string.val, from, len, string.val + string.len));
} else {
ret = mbfl_strcut(&string, &result, from, len);
ZEND_ASSERT(ret != NULL);
RETVAL_STRINGL((char *)ret->val, ret->len); /* the string is already strdup()'ed */
efree(ret->val);
}
unsigned int char_len = string.encoding->flag & (MBFL_ENCTYPE_SBCS | MBFL_ENCTYPE_WCS2 | MBFL_ENCTYPE_WCS4);
if (char_len) {
/* Round `from` down to a multiple of `char_len`; works because `char_len` is a power of 2 */
from &= -char_len;
if (len > string.len - from) {
len = string.len - from;
}
RETURN_STR(zend_string_init_fast((const char*)(string.val + from), len & -char_len));
}
ret = mbfl_strcut(&string, &result, from, len);
ZEND_ASSERT(ret != NULL);
RETVAL_STRINGL((char *)ret->val, ret->len); /* the string is already strdup()'ed */
efree(ret->val);
}
/* }}} */

View File

@@ -225,6 +225,33 @@ print "== UHC ==\n";
print "Single byte 0x96: [" . bin2hex(mb_strcut("\x96", 1, 1280, "UHC")) . "]\n";
print "== ASCII ==\n";
print "Empty: [" . bin2hex(mb_strcut("ABC", 0, 0, "ASCII")) . "]\n";
print "Empty: [" . bin2hex(mb_strcut("ABC", 1, 0, "ASCII")) . "]\n";
print "Empty: [" . bin2hex(mb_strcut("ABC", 2, 0, "ASCII")) . "]\n";
print "One char: [" . bin2hex(mb_strcut("ABC", 2, 1, "ASCII")) . "]\n";
print "Two chars: [" . bin2hex(mb_strcut("ABC", 1, 2, "ASCII")) . "]\n";
print "Two chars: [" . bin2hex(mb_strcut("ABC", 1, 3, "ASCII")) . "]\n";
print "== UCS-2BE ==\n";
print "Empty: [" . bin2hex(mb_strcut("\x00A\x00B\x00C", 0, 0, "UCS-2BE")) . "]\n";
print "Empty: [" . bin2hex(mb_strcut("\x00A\x00B\x00C", 1, 0, "UCS-2BE")) . "]\n";
print "Empty: [" . bin2hex(mb_strcut("\x00A\x00B\x00C", 2, 0, "UCS-2BE")) . "]\n";
print "Empty: [" . bin2hex(mb_strcut("\x00A\x00B\x00C", 2, 1, "UCS-2BE")) . "]\n";
print "One char: [" . bin2hex(mb_strcut("\x00A\x00B\x00C", 1, 2, "UCS-2BE")) . "]\n";
print "Cut in middle of following char: [" . bin2hex(mb_strcut("\x00A\x00B\x00C", 1, 3, "UCS-2BE")) . "]\n";
print "Two chars: [" . bin2hex(mb_strcut("\x00A\x00B\x00C", 1, 4, "UCS-2BE")) . "]\n";
print "== UCS-4BE ==\n";
print "From 1, Length 5: [" . bin2hex(mb_strcut("\x00\x00\x00\x41\x00\x00\x00\x42", 1, 5, "UCS-4BE")) . "]\n";
print "From 1, Length 6: [" . bin2hex(mb_strcut("\x00\x00\x00\x41\x00\x00\x00\x42", 1, 6, "UCS-4BE")) . "]\n";
print "From 1, Length 8: [" . bin2hex(mb_strcut("\x00\x00\x00\x41\x00\x00\x00\x42", 1, 8, "UCS-4BE")) . "]\n";
?>
--EXPECT--
== EUC-JP ==
@@ -382,3 +409,22 @@ Invalid byte 0xF5: []
Double-byte char: []
== UHC ==
Single byte 0x96: [96]
== ASCII ==
Empty: []
Empty: []
Empty: []
One char: [43]
Two chars: [4243]
Two chars: [4243]
== UCS-2BE ==
Empty: []
Empty: []
Empty: []
Empty: []
One char: [0041]
Cut in middle of following char: [0041]
Two chars: [00410042]
== UCS-4BE ==
From 1, Length 5: [00000041]
From 1, Length 6: [00000041]
From 1, Length 8: [0000004100000042]