Mirror of https://github.com/php/php-src.git (synced 2026-03-24 00:02:20 +01:00)
Optimize mb_strcut for fixed-byte-length text encodings
On microbenchmarks run on my dev machine, mb_strcut is now ~50% faster for fixed-byte-length text encodings like ASCII. (This is because the previous code did an extra, unnecessary copy operation on the resulting output string.)
This commit is contained in:
@@ -2443,12 +2443,22 @@ PHP_FUNCTION(mb_strcut)
|
||||
|
||||
if (enc->cut) {
|
||||
RETURN_STR(enc->cut(string.val, from, len, string.val + string.len));
|
||||
} else {
|
||||
ret = mbfl_strcut(&string, &result, from, len);
|
||||
ZEND_ASSERT(ret != NULL);
|
||||
RETVAL_STRINGL((char *)ret->val, ret->len); /* the string is already strdup()'ed */
|
||||
efree(ret->val);
|
||||
}
|
||||
|
||||
unsigned int char_len = string.encoding->flag & (MBFL_ENCTYPE_SBCS | MBFL_ENCTYPE_WCS2 | MBFL_ENCTYPE_WCS4);
|
||||
if (char_len) {
|
||||
/* Round `from` down to a multiple of `char_len`; works because `char_len` is a power of 2 */
|
||||
from &= -char_len;
|
||||
if (len > string.len - from) {
|
||||
len = string.len - from;
|
||||
}
|
||||
RETURN_STR(zend_string_init_fast((const char*)(string.val + from), len & -char_len));
|
||||
}
|
||||
|
||||
ret = mbfl_strcut(&string, &result, from, len);
|
||||
ZEND_ASSERT(ret != NULL);
|
||||
RETVAL_STRINGL((char *)ret->val, ret->len); /* the string is already strdup()'ed */
|
||||
efree(ret->val);
|
||||
}
|
||||
/* }}} */
|
||||
|
||||
|
||||
@@ -225,6 +225,33 @@ print "== UHC ==\n";
|
||||
|
||||
print "Single byte 0x96: [" . bin2hex(mb_strcut("\x96", 1, 1280, "UHC")) . "]\n";
|
||||
|
||||
print "== ASCII ==\n";
|
||||
|
||||
print "Empty: [" . bin2hex(mb_strcut("ABC", 0, 0, "ASCII")) . "]\n";
|
||||
print "Empty: [" . bin2hex(mb_strcut("ABC", 1, 0, "ASCII")) . "]\n";
|
||||
print "Empty: [" . bin2hex(mb_strcut("ABC", 2, 0, "ASCII")) . "]\n";
|
||||
|
||||
print "One char: [" . bin2hex(mb_strcut("ABC", 2, 1, "ASCII")) . "]\n";
|
||||
print "Two chars: [" . bin2hex(mb_strcut("ABC", 1, 2, "ASCII")) . "]\n";
|
||||
print "Two chars: [" . bin2hex(mb_strcut("ABC", 1, 3, "ASCII")) . "]\n";
|
||||
|
||||
print "== UCS-2BE ==\n";
|
||||
|
||||
print "Empty: [" . bin2hex(mb_strcut("\x00A\x00B\x00C", 0, 0, "UCS-2BE")) . "]\n";
|
||||
print "Empty: [" . bin2hex(mb_strcut("\x00A\x00B\x00C", 1, 0, "UCS-2BE")) . "]\n";
|
||||
print "Empty: [" . bin2hex(mb_strcut("\x00A\x00B\x00C", 2, 0, "UCS-2BE")) . "]\n";
|
||||
|
||||
print "Empty: [" . bin2hex(mb_strcut("\x00A\x00B\x00C", 2, 1, "UCS-2BE")) . "]\n";
|
||||
print "One char: [" . bin2hex(mb_strcut("\x00A\x00B\x00C", 1, 2, "UCS-2BE")) . "]\n";
|
||||
print "Cut in middle of following char: [" . bin2hex(mb_strcut("\x00A\x00B\x00C", 1, 3, "UCS-2BE")) . "]\n";
|
||||
print "Two chars: [" . bin2hex(mb_strcut("\x00A\x00B\x00C", 1, 4, "UCS-2BE")) . "]\n";
|
||||
|
||||
print "== UCS-4BE ==\n";
|
||||
|
||||
print "From 1, Length 5: [" . bin2hex(mb_strcut("\x00\x00\x00\x41\x00\x00\x00\x42", 1, 5, "UCS-4BE")) . "]\n";
|
||||
print "From 1, Length 6: [" . bin2hex(mb_strcut("\x00\x00\x00\x41\x00\x00\x00\x42", 1, 6, "UCS-4BE")) . "]\n";
|
||||
print "From 1, Length 8: [" . bin2hex(mb_strcut("\x00\x00\x00\x41\x00\x00\x00\x42", 1, 8, "UCS-4BE")) . "]\n";
|
||||
|
||||
?>
|
||||
--EXPECT--
|
||||
== EUC-JP ==
|
||||
@@ -382,3 +409,22 @@ Invalid byte 0xF5: []
|
||||
Double-byte char: []
|
||||
== UHC ==
|
||||
Single byte 0x96: [96]
|
||||
== ASCII ==
|
||||
Empty: []
|
||||
Empty: []
|
||||
Empty: []
|
||||
One char: [43]
|
||||
Two chars: [4243]
|
||||
Two chars: [4243]
|
||||
== UCS-2BE ==
|
||||
Empty: []
|
||||
Empty: []
|
||||
Empty: []
|
||||
Empty: []
|
||||
One char: [0041]
|
||||
Cut in middle of following char: [0041]
|
||||
Two chars: [00410042]
|
||||
== UCS-4BE ==
|
||||
From 1, Length 5: [00000041]
|
||||
From 1, Length 6: [00000041]
|
||||
From 1, Length 8: [0000004100000042]
|
||||
|
||||
Reference in New Issue
Block a user