mirror of
https://github.com/php/php-src.git
synced 2026-03-24 00:02:20 +01:00
Optimizations for mb_trim (#12803)
* Fast path for when there is nothing to trim in mb_trim * Make mb_trim decide between linear search vs hash table lookup Using empirical experiments I noticed that on my i7-4790 the hash table approach becomes faster once we have more than 4 code points in the trim characters, when evaluated on the worst case. This patch changes the logic so that a hash table is used for a large number of trim characters, and linear search when the number of trim characters is <= 4.
This commit is contained in:
@@ -2951,12 +2951,21 @@ typedef enum {
|
||||
MB_BOTH_TRIM = 3
|
||||
} mb_trim_mode;
|
||||
|
||||
static zend_always_inline bool is_trim_wchar(uint32_t w, const HashTable *ht)
|
||||
static bool is_trim_wchar(uint32_t w, const HashTable *ht, const uint32_t *default_chars, size_t default_chars_length)
|
||||
{
|
||||
return zend_hash_index_exists(ht, w);
|
||||
if (ht) {
|
||||
return zend_hash_index_exists(ht, w);
|
||||
} else {
|
||||
for (size_t i = 0; i < default_chars_length; i++) {
|
||||
if (w == default_chars[i]) {
|
||||
return true;
|
||||
}
|
||||
}
|
||||
return false;
|
||||
}
|
||||
}
|
||||
|
||||
static zend_string* trim_each_wchar(zend_string *str, const HashTable *what_ht, mb_trim_mode mode, const mbfl_encoding *enc)
|
||||
static zend_string* trim_each_wchar(zend_string *str, const HashTable *what_ht, const uint32_t *default_chars, size_t default_chars_length, mb_trim_mode mode, const mbfl_encoding *enc)
|
||||
{
|
||||
unsigned char *in = (unsigned char*)ZSTR_VAL(str);
|
||||
uint32_t wchar_buf[128];
|
||||
@@ -2974,7 +2983,7 @@ static zend_string* trim_each_wchar(zend_string *str, const HashTable *what_ht,
|
||||
|
||||
for (size_t i = 0; i < out_len; i++) {
|
||||
uint32_t w = wchar_buf[i];
|
||||
if (is_trim_wchar(w, what_ht)) {
|
||||
if (is_trim_wchar(w, what_ht, default_chars, default_chars_length)) {
|
||||
if (mode & MB_LTRIM) {
|
||||
left += 1;
|
||||
}
|
||||
@@ -2990,6 +2999,9 @@ static zend_string* trim_each_wchar(zend_string *str, const HashTable *what_ht,
|
||||
}
|
||||
}
|
||||
|
||||
if (left == 0 && right == 0) {
|
||||
return zend_string_copy(str);
|
||||
}
|
||||
return mb_get_substr(str, left, total_len - (right + left), enc);
|
||||
}
|
||||
|
||||
@@ -3012,7 +3024,7 @@ static zend_string* mb_trim_default_chars(zend_string *str, mb_trim_mode mode, c
|
||||
for (size_t i = 0; i < trim_default_chars_length; i++) {
|
||||
zend_hash_index_add_new(&what_ht, trim_default_chars[i], &val);
|
||||
}
|
||||
zend_string* retval = trim_each_wchar(str, &what_ht, mode, enc);
|
||||
zend_string* retval = trim_each_wchar(str, &what_ht, NULL, 0, mode, enc);
|
||||
zend_hash_destroy(&what_ht);
|
||||
|
||||
return retval;
|
||||
@@ -3027,18 +3039,32 @@ static zend_string* mb_trim_what_chars(zend_string *str, zend_string *what, mb_t
|
||||
size_t what_len = ZSTR_LEN(what);
|
||||
HashTable what_ht;
|
||||
zval val;
|
||||
ZVAL_TRUE(&val);
|
||||
zend_hash_init(&what_ht, what_len, NULL, NULL, false);
|
||||
bool hash_initialized = false;
|
||||
|
||||
while (what_len) {
|
||||
what_out_len = enc->to_wchar(&what_in, &what_len, what_wchar_buf, 128, &state);
|
||||
ZEND_ASSERT(what_out_len <= 128);
|
||||
for (size_t i = 0; i < what_out_len; i++) {
|
||||
zend_hash_index_add(&what_ht, what_wchar_buf[i], &val);
|
||||
|
||||
if (what_out_len <= 4 && !hash_initialized) {
|
||||
return trim_each_wchar(str, NULL, what_wchar_buf, what_out_len, mode, enc);
|
||||
} else {
|
||||
if (!hash_initialized) {
|
||||
hash_initialized = true;
|
||||
ZVAL_TRUE(&val);
|
||||
zend_hash_init(&what_ht, what_len, NULL, NULL, false);
|
||||
}
|
||||
for (size_t i = 0; i < what_out_len; i++) {
|
||||
zend_hash_index_add(&what_ht, what_wchar_buf[i], &val);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
zend_string *retval = trim_each_wchar(str, &what_ht, mode, enc);
|
||||
if (UNEXPECTED(!hash_initialized)) {
|
||||
/* This is only possible if what is empty */
|
||||
return zend_string_copy(str);
|
||||
}
|
||||
|
||||
zend_string *retval = trim_each_wchar(str, &what_ht, NULL, 0, mode, enc);
|
||||
zend_hash_destroy(&what_ht);
|
||||
|
||||
return retval;
|
||||
|
||||
@@ -40,6 +40,15 @@ var_dump(mb_trim(str_repeat(" ", 129)));
|
||||
var_dump(mb_trim(str_repeat(" ", 129) . "a"));
|
||||
var_dump(mb_rtrim(str_repeat(" ", 129) . "a"));
|
||||
|
||||
echo "== Very long trim characters ==\n";
|
||||
$trim_chars = "";
|
||||
for ($i = 1024; $i < 2048; $i++) {
|
||||
$trim_chars .= mb_chr($i);
|
||||
}
|
||||
var_dump(mb_trim($trim_chars . "hello" . $trim_chars, $trim_chars));
|
||||
var_dump(strlen(mb_ltrim($trim_chars . "hello" . $trim_chars, $trim_chars)));
|
||||
var_dump(strlen(mb_rtrim($trim_chars . "hello" . $trim_chars, $trim_chars)));
|
||||
|
||||
echo "== mb_ltrim ==\n";
|
||||
var_dump(mb_ltrim("あああああああああああああああああああああああああああああああああいああああ", "あ"));
|
||||
echo "== mb_rtrim ==\n";
|
||||
@@ -103,6 +112,10 @@ string(26) " あいうおえお a"
|
||||
string(0) ""
|
||||
string(1) "a"
|
||||
string(388) " a"
|
||||
== Very long trim characters ==
|
||||
string(5) "hello"
|
||||
int(2053)
|
||||
int(2053)
|
||||
== mb_ltrim ==
|
||||
string(15) "いああああ"
|
||||
== mb_rtrim ==
|
||||
|
||||
Reference in New Issue
Block a user